The Thera bank recently saw a steep decline in the number of users of their credit card. Credit cards are a good source of income for banks because of the different kinds of fees they charge, such as annual fees, balance transfer fees, cash advance fees, late payment fees, and foreign transaction fees. Some fees are charged to every user irrespective of usage, while others are charged under specified circumstances.
The data scientists at Thera bank need to come up with a classification model that will help the bank improve its services so that customers do not renounce their credit cards.
Objectives
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd
# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# To tune model, get different metric scores and split data
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
# Metric scores and confusion-matrix display.
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator / .from_predictions is its replacement.
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score,
    ConfusionMatrixDisplay,
)
from sklearn.ensemble import (
AdaBoostClassifier,
GradientBoostingClassifier,
RandomForestClassifier,
BaggingClassifier,
)
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
# To impute missing values
from sklearn.impute import KNNImputer
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# To bin continuous data into intervals
from sklearn.preprocessing import KBinsDiscretizer
# To build a logistic regression model
from sklearn.linear_model import LogisticRegression
# To oversample and undersample data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
# To be used for missing value imputation
from sklearn.impute import SimpleImputer
# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import copy
# To suppress the warnings
import warnings
warnings.filterwarnings("ignore")
# This will help in making the Python code more structured automatically (good coding practice)
%load_ext nb_black
# Load the raw churn data (expects BankChurners.csv in the working directory)
bank = pd.read_csv("BankChurners.csv")
# Report the shape of the dataset
rows, cols = bank.shape
print("The dataset has", rows, "rows and", cols, "columns")
The dataset has 10127 rows and 21 columns
# let's create a copy of the data so the raw load in `bank` stays untouched
df = bank.copy()
# let's view the first 5 rows of the data
df.head()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | ... | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | ... | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | ... | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | ... | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 769911858 | Existing Customer | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | ... | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | ... | 1 | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 |
5 rows × 21 columns
# let's view the last 5 rows of the data
df.tail()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | ... | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10122 | 772366833 | Existing Customer | 50 | M | 2 | Graduate | Single | $40K - $60K | Blue | 40 | ... | 2 | 3 | 4003.0 | 1851 | 2152.0 | 0.703 | 15476 | 117 | 0.857 | 0.462 |
| 10123 | 710638233 | Attrited Customer | 41 | M | 2 | NaN | Divorced | $40K - $60K | Blue | 25 | ... | 2 | 3 | 4277.0 | 2186 | 2091.0 | 0.804 | 8764 | 69 | 0.683 | 0.511 |
| 10124 | 716506083 | Attrited Customer | 44 | F | 1 | High School | Married | Less than $40K | Blue | 36 | ... | 3 | 4 | 5409.0 | 0 | 5409.0 | 0.819 | 10291 | 60 | 0.818 | 0.000 |
| 10125 | 717406983 | Attrited Customer | 30 | M | 2 | Graduate | NaN | $40K - $60K | Blue | 36 | ... | 3 | 3 | 5281.0 | 0 | 5281.0 | 0.535 | 8395 | 62 | 0.722 | 0.000 |
| 10126 | 714337233 | Attrited Customer | 43 | F | 2 | Graduate | Married | Less than $40K | Silver | 25 | ... | 2 | 4 | 10388.0 | 1961 | 8427.0 | 0.703 | 10294 | 61 | 0.649 | 0.189 |
5 rows × 21 columns
# let's check the data types of the columns in the dataset
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLIENTNUM 10127 non-null int64 1 Attrition_Flag 10127 non-null object 2 Customer_Age 10127 non-null int64 3 Gender 10127 non-null object 4 Dependent_count 10127 non-null int64 5 Education_Level 8608 non-null object 6 Marital_Status 9378 non-null object 7 Income_Category 10127 non-null object 8 Card_Category 10127 non-null object 9 Months_on_book 10127 non-null int64 10 Total_Relationship_Count 10127 non-null int64 11 Months_Inactive_12_mon 10127 non-null int64 12 Contacts_Count_12_mon 10127 non-null int64 13 Credit_Limit 10127 non-null float64 14 Total_Revolving_Bal 10127 non-null int64 15 Avg_Open_To_Buy 10127 non-null float64 16 Total_Amt_Chng_Q4_Q1 10127 non-null float64 17 Total_Trans_Amt 10127 non-null int64 18 Total_Trans_Ct 10127 non-null int64 19 Total_Ct_Chng_Q4_Q1 10127 non-null float64 20 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(10), object(6) memory usage: 1.6+ MB
# let's check for duplicate values in the data
df.duplicated().sum()
0
# let's check for missing values in the data
round(df.isnull().sum() / df.isnull().count() * 100, 2)
CLIENTNUM 0.0 Attrition_Flag 0.0 Customer_Age 0.0 Gender 0.0 Dependent_count 0.0 Education_Level 15.0 Marital_Status 7.4 Income_Category 0.0 Card_Category 0.0 Months_on_book 0.0 Total_Relationship_Count 0.0 Months_Inactive_12_mon 0.0 Contacts_Count_12_mon 0.0 Credit_Limit 0.0 Total_Revolving_Bal 0.0 Avg_Open_To_Buy 0.0 Total_Amt_Chng_Q4_Q1 0.0 Total_Trans_Amt 0.0 Total_Trans_Ct 0.0 Total_Ct_Chng_Q4_Q1 0.0 Avg_Utilization_Ratio 0.0 dtype: float64
The Education_Level column has 15% missing values out of the total observations. The Marital_Status column has 7.4% missing values out of the total observations.
# let's view the statistical summary of the numerical columns in the data
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| CLIENTNUM | 10127.0 | 7.391776e+08 | 3.690378e+07 | 708082083.0 | 7.130368e+08 | 7.179264e+08 | 7.731435e+08 | 8.283431e+08 |
| Customer_Age | 10127.0 | 4.632596e+01 | 8.016814e+00 | 26.0 | 4.100000e+01 | 4.600000e+01 | 5.200000e+01 | 7.300000e+01 |
| Dependent_count | 10127.0 | 2.346203e+00 | 1.298908e+00 | 0.0 | 1.000000e+00 | 2.000000e+00 | 3.000000e+00 | 5.000000e+00 |
| Months_on_book | 10127.0 | 3.592841e+01 | 7.986416e+00 | 13.0 | 3.100000e+01 | 3.600000e+01 | 4.000000e+01 | 5.600000e+01 |
| Total_Relationship_Count | 10127.0 | 3.812580e+00 | 1.554408e+00 | 1.0 | 3.000000e+00 | 4.000000e+00 | 5.000000e+00 | 6.000000e+00 |
| Months_Inactive_12_mon | 10127.0 | 2.341167e+00 | 1.010622e+00 | 0.0 | 2.000000e+00 | 2.000000e+00 | 3.000000e+00 | 6.000000e+00 |
| Contacts_Count_12_mon | 10127.0 | 2.455317e+00 | 1.106225e+00 | 0.0 | 2.000000e+00 | 2.000000e+00 | 3.000000e+00 | 6.000000e+00 |
| Credit_Limit | 10127.0 | 8.631954e+03 | 9.088777e+03 | 1438.3 | 2.555000e+03 | 4.549000e+03 | 1.106750e+04 | 3.451600e+04 |
| Total_Revolving_Bal | 10127.0 | 1.162814e+03 | 8.149873e+02 | 0.0 | 3.590000e+02 | 1.276000e+03 | 1.784000e+03 | 2.517000e+03 |
| Avg_Open_To_Buy | 10127.0 | 7.469140e+03 | 9.090685e+03 | 3.0 | 1.324500e+03 | 3.474000e+03 | 9.859000e+03 | 3.451600e+04 |
| Total_Amt_Chng_Q4_Q1 | 10127.0 | 7.599407e-01 | 2.192068e-01 | 0.0 | 6.310000e-01 | 7.360000e-01 | 8.590000e-01 | 3.397000e+00 |
| Total_Trans_Amt | 10127.0 | 4.404086e+03 | 3.397129e+03 | 510.0 | 2.155500e+03 | 3.899000e+03 | 4.741000e+03 | 1.848400e+04 |
| Total_Trans_Ct | 10127.0 | 6.485869e+01 | 2.347257e+01 | 10.0 | 4.500000e+01 | 6.700000e+01 | 8.100000e+01 | 1.390000e+02 |
| Total_Ct_Chng_Q4_Q1 | 10127.0 | 7.122224e-01 | 2.380861e-01 | 0.0 | 5.820000e-01 | 7.020000e-01 | 8.180000e-01 | 3.714000e+00 |
| Avg_Utilization_Ratio | 10127.0 | 2.748936e-01 | 2.756915e-01 | 0.0 | 2.300000e-02 | 1.760000e-01 | 5.030000e-01 | 9.990000e-01 |
Observations
CLIENTNUM: CLIENTNUM being a unique identifier doesn't add any meaning to the analysis and can be dropped. Customer_Age: Customer_Age varies between 26 and 73 (years), with 50% of customers in the 41-52 range. Dependent_count: Dependent_count varies between 0 and 5, with 75% of customers having 3 or fewer dependents. Months_on_book: Months_on_book varies between 13 and 56. Total_Relationship_Count: Total_Relationship_Count varies between 1 and 6. Months_Inactive_12_mon: Months_Inactive_12_mon varies between 0 and 6, with 75% of customers inactive for 3 or fewer months. Contacts_Count_12_mon: Contacts_Count_12_mon varies between 0 and 6. Credit_Limit: Credit_Limit varies between 1438.3 and 34516. Total_Revolving_Bal: Total_Revolving_Bal varies between 0 and 2517. Avg_Open_To_Buy: Avg_Open_To_Buy varies between 3 and 34516. Total_Amt_Chng_Q4_Q1: Total_Amt_Chng_Q4_Q1 varies between 0 and 3.397. Total_Trans_Amt: Total_Trans_Amt varies between 510 and 18484. Total_Trans_Ct: Total_Trans_Ct varies between 10 and 139. Total_Ct_Chng_Q4_Q1: Total_Ct_Chng_Q4_Q1 varies between 0 and 3.714. Avg_Utilization_Ratio: Avg_Utilization_Ratio varies between 0 and 0.999.
# let's view the statistical summary of the non-numerical columns in the data
df.describe(exclude=np.number).T
| count | unique | top | freq | |
|---|---|---|---|---|
| Attrition_Flag | 10127 | 2 | Existing Customer | 8500 |
| Gender | 10127 | 2 | F | 5358 |
| Education_Level | 8608 | 6 | Graduate | 3128 |
| Marital_Status | 9378 | 3 | Married | 4687 |
| Income_Category | 10127 | 6 | Less than $40K | 3561 |
| Card_Category | 10127 | 4 | Blue | 9436 |
Observations
# Gather all object-typed (categorical) columns
cat_col = df.select_dtypes(['object']).columns
# Print the frequency table of each categorical column, NaN included
for column in cat_col:
    counts = df[column].value_counts(dropna=False)
    print(counts)
    print("-" * 50)
Existing Customer 8500 Attrited Customer 1627 Name: Attrition_Flag, dtype: int64 -------------------------------------------------- F 5358 M 4769 Name: Gender, dtype: int64 -------------------------------------------------- Graduate 3128 High School 2013 NaN 1519 Uneducated 1487 College 1013 Post-Graduate 516 Doctorate 451 Name: Education_Level, dtype: int64 -------------------------------------------------- Married 4687 Single 3943 NaN 749 Divorced 748 Name: Marital_Status, dtype: int64 -------------------------------------------------- Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 abc 1112 $120K + 727 Name: Income_Category, dtype: int64 -------------------------------------------------- Blue 9436 Silver 555 Gold 116 Platinum 20 Name: Card_Category, dtype: int64 --------------------------------------------------
Attrition_Flag: Attrition_Flag is split between 'Existing Customer' and 'Attrited Customer' at 84% and 16% respectively. There is a class imbalance, so we will likely need to look into the benefits of oversampling and undersampling. Gender: Gender is split almost evenly between 'F' and 'M' (53% and 47% respectively). Education_Level: The most common Education_Level is 'Graduate' and the least common value is 'Doctorate'. There are 1519 missing values, which is a large percentage. There is an ordinal relationship between the various education levels with Uneducated < High School < College < Graduate < Post-Graduate < Doctorate. Marital_Status: The most common Marital_Status is 'Married' and the least common value is 'Divorced'. There are 749 missing values, which is a large percentage. Income_Category: The most common Income_Category is 'Less than $40K' and the least common value is '$120K +'. There are also 1112 rows with 'abc'. These should be treated as missing. Card_Category: The most common Card_Category is 'Blue' and the least common value is 'Platinum'. Though not stated in the problem statement, there is an ordinal relation among the card categories, with Blue < Silver < Gold < Platinum.
# CLIENTNUM is unique for each record and might not add value to modeling
df.drop(["CLIENTNUM"], axis=1, inplace=True)
# function to plot a boxplot and a histogram along the same scale.
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Plot a boxplot and a histogram of one numeric column, sharing the x-axis.

    data: dataframe
    feature: dataframe column name
    figsize: size of figure (default (12, 7))
    kde: whether to show the density curve on the histogram (default False)
    bins: number of bins for the histogram (default None, i.e. seaborn's auto)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # boxplot on top, histogram below
        sharex=True,  # shared x-axis keeps the two plots aligned
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    # Boxplot; showmeans marks the mean of the column with a star.
    sns.boxplot(data=data, x=feature, ax=ax_box2, showmeans=True, color="violet")
    # FIX: the original abused a conditional *expression* for its side
    # effects; a plain if/else is the idiomatic form. `bins` is only
    # forwarded when given because seaborn rejects bins=None. The original
    # also passed palette="winter" without `hue`, which seaborn ignores
    # (and warns about in recent versions), so it is dropped.
    if bins:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins)
    else:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2)
    # Reference lines: dashed green = mean, solid black = median.
    ax_hist2.axvline(data[feature].mean(), color="green", linestyle="--")
    ax_hist2.axvline(data[feature].median(), color="black", linestyle="-")
# Draw the boxplot + histogram combo for every numeric column
num_col = df.select_dtypes(['int64','float64']).columns
for feature in num_col:
    histogram_boxplot(df, feature)
Observations
Customer_Age: Customer_Age is normally distributed centered at 47 and no outliers.Dependent_count: Dependent_count is normally distributed centered at 2.2 with no outliers.Months_on_book: Months_on_book is fairly normally distributed but there's a peak at 36 that is interesting and should be investigated further. There are some outliers in both sides.Total_Relationship_Count: Total_Relationship_Count varies between 1 and 6 with a peak at 3. There are no outliers.Months_Inactive_12_mon: Months_Inactive_12_mon varies between 0 and 6, with a peak at 3. There are some outliers in both sides.Contacts_Count_12_mon: Contacts_Count_12_mon varies between 0 and 6, with a peak at 3. There are some outliers in both sides.Credit_Limit: Credit_Limit is right skewed with many outliers on the higher end. The skew could be reduced by doing a log on the column. This should be checked more.Total_Revolving_Bal: Total_Revolving_Bal is normally distributed except for the value of 0 and the maximum value. Avg_Open_To_Buy: Avg_Open_To_Buy is right skewed with many outliers on the higher end. The skew could be reduced by doing a log on the column. This should be checked more.Total_Amt_Chng_Q4_Q1: Total_Amt_Chng_Q4_Q1 is normally distributed but has a large number of outliers on both sides.Total_Trans_Amt: Total_Trans_Amt seems to have four different clusters. Each cluster is a mini normal curve. This is interesting and should be investigated more.Total_Trans_Ct: Total_Trans_Ct seems to have two different clusters, each of which appears to be normally distributed. There are outliers on the high end.Total_Ct_Chng_Q4_Q1: Total_Ct_Chng_Q4_Q1 has a fairly normal distribution but has a large number of outliers on the high end.Avg_Utilization_Ratio: Avg_Utilization_Ratio is right skewed with many rows at 0.# function to create labeled barplots
def labeled_barplot(data, feature, perc=False, n=None):
    """
    Plot a countplot of a categorical column with each bar labeled.

    data: dataframe
    feature: dataframe column name
    perc: whether to display percentages instead of counts (default False)
    n: displays the top n category levels (default None, i.e. all levels)
    """
    total = len(data[feature])  # denominator for the percentage labels
    count = data[feature].nunique()
    # FIX: the two identical plt.figure branches are collapsed; the figure
    # width scales with the number of bars actually shown.
    plt.figure(figsize=((count if n is None else n) + 1, 5))
    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        # keep the n most frequent levels, then order them alphabetically
        order=data[feature].value_counts().index[:n].sort_values(),
    )
    for p in ax.patches:
        # FIX: `if perc == True` replaced with the idiomatic truthiness test
        if perc:
            # percentage of each class of the category
            label = "{:.1f}%".format(100 * p.get_height() / total)
        else:
            label = p.get_height()  # count of each level of the category
        x = p.get_x() + p.get_width() / 2  # horizontal center of the bar
        y = p.get_height()  # top of the bar
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # annotate just above the bar
    plt.show()  # show the plot
# Let's see the plots on categorical columns (labeled bar plots)
cat_col = df.select_dtypes(['object']).columns
for col in cat_col:
labeled_barplot(df, col)
Observations made earlier are confirmed here using bar plots.
Attrition_Flag: Attrition_Flag is split between 'Existing Customer' and 'Attrited Customer' at 84% and 16% respectively. There is a class imbalance, so we will likely need to look into the benefits of oversampling and undersampling. Gender: Gender is split almost evenly between 'F' and 'M' (53% and 47% respectively). Education_Level: The most common Education_Level is 'Graduate' and the least common value is 'Doctorate'. There are 1519 missing values, which is a large percentage. There is an ordinal relationship between the various education levels with Uneducated < High School < College < Graduate < Post-Graduate < Doctorate. Marital_Status: The most common Marital_Status is 'Married' and the least common value is 'Divorced'. There are 749 missing values, which is a large percentage. Income_Category: The most common Income_Category is 'Less than $40K' and the least common value is '$120K +'. There are also 1112 rows with 'abc'. These should be treated as missing. Card_Category: The most common Card_Category is 'Blue' and the least common value is 'Platinum'. Though not stated in the problem statement, there is an ordinal relation among the card categories, with Blue < Silver < Gold < Platinum.
plt.figure(figsize=(15, 7))
# Correlation heatmap of the numeric columns (annotated, fixed -1..1 scale)
sns.heatmap(df.corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()
# Pairwise scatter plots of all numeric columns; KDE curves on the diagonal
sns.pairplot(data=df, diag_kind="kde")
plt.show()
df["Attrition_Flag"].value_counts()
Existing Customer 8500 Attrited Customer 1627 Name: Attrition_Flag, dtype: int64
# Encode the target: 'Existing Customer' -> 0, 'Attrited Customer' -> 1.
# FIX: the original used Series.replace(..., inplace=True) on a column,
# which is deprecated in pandas 2.x and can silently fail to update the
# DataFrame under copy-on-write; assigning the result back is safe.
df["Attrition_Flag"] = df["Attrition_Flag"].replace(
    {"Existing Customer": 0, "Attrited Customer": 1}
)
df["Attrition_Flag"].value_counts()
0 8500 1 1627 Name: Attrition_Flag, dtype: int64
# Let's look at the distribution before modifying
cols = ['Income_Category','Education_Level','Card_Category']
for col in cols:
print(df[col].value_counts(dropna=False))
Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 abc 1112 $120K + 727 Name: Income_Category, dtype: int64 Graduate 3128 High School 2013 NaN 1519 Uneducated 1487 College 1013 Post-Graduate 516 Doctorate 451 Name: Education_Level, dtype: int64 Blue 9436 Silver 555 Gold 116 Platinum 20 Name: Card_Category, dtype: int64
# Let's do some feature engineering before continuing the analysis.
# 'abc' is a placeholder for unknown income, so treat it as missing.
# FIX: the original used chained indexing (df['Income_Category'][mask] = ...),
# which raises SettingWithCopyWarning and can silently fail to modify df
# under pandas copy-on-write; .loc performs the assignment in place.
df.loc[df["Income_Category"] == "abc", "Income_Category"] = np.nan
# Ordinal encodings: each of these categorical columns has a natural order.
replaceStruct = {
    "Education_Level": {
        "Uneducated": 1, "High School": 2, "College": 3,
        "Graduate": 4, "Post-Graduate": 5, "Doctorate": 6,
    },
    "Income_Category": {
        "Less than $40K": 1, "$40K - $60K": 2, "$60K - $80K": 3,
        "$80K - $120K": 4, "$120K +": 5,
    },
    "Card_Category": {"Blue": 1, "Silver": 2, "Gold": 3, "Platinum": 4},
}
df = df.replace(replaceStruct)
# Let's verify the distribution by comparing with above.
cols = ['Income_Category','Education_Level','Card_Category']
for col in cols:
print(df[col].value_counts(dropna=False))
1.0 3561 2.0 1790 4.0 1535 3.0 1402 NaN 1112 5.0 727 Name: Income_Category, dtype: int64 4.0 3128 2.0 2013 NaN 1519 1.0 1487 3.0 1013 5.0 516 6.0 451 Name: Education_Level, dtype: int64 1 9436 2 555 3 116 4 20 Name: Card_Category, dtype: int64
sns.pairplot(data=df, diag_kind="kde", hue="Attrition_Flag")
plt.show()
Observations
# Let's redo the correlation plot since we now have the target variable as numeric
plt.figure(figsize=(15, 7))
sns.heatmap(df.corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()
# function to plot stacked bar chart
def stacked_barplot(data, predictor, target):
    """
    Print the crosstab of predictor vs target and plot a stacked bar chart.

    data: dataframe
    predictor: independent variable (column name)
    target: target variable (column name)
    """
    count = data[predictor].nunique()
    # least frequent target level; crosstab rows are sorted by it, descending
    sorter = data[target].value_counts().index[-1]
    # counts crosstab (with row/column margins) for the printed table
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)
    # row-normalized crosstab for the stacked proportions plot
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 1, 5))
    # FIX: the original called plt.legend twice; the first call
    # (loc="lower left", frameon=False) was immediately overwritten by the
    # second and had no effect, so only the effective call is kept.
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null int64 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null object 3 Dependent_count 10127 non-null int64 4 Education_Level 8608 non-null float64 5 Marital_Status 9378 non-null object 6 Income_Category 9015 non-null float64 7 Card_Category 10127 non-null int64 8 Months_on_book 10127 non-null int64 9 Total_Relationship_Count 10127 non-null int64 10 Months_Inactive_12_mon 10127 non-null int64 11 Contacts_Count_12_mon 10127 non-null int64 12 Credit_Limit 10127 non-null float64 13 Total_Revolving_Bal 10127 non-null int64 14 Avg_Open_To_Buy 10127 non-null float64 15 Total_Amt_Chng_Q4_Q1 10127 non-null float64 16 Total_Trans_Amt 10127 non-null int64 17 Total_Trans_Ct 10127 non-null int64 18 Total_Ct_Chng_Q4_Q1 10127 non-null float64 19 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(7), int64(11), object(2) memory usage: 1.5+ MB
df.columns
Index(['Attrition_Flag', 'Customer_Age', 'Gender', 'Dependent_count',
'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category',
'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon',
'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio'],
dtype='object')
# Let's continue the bivariate analysis with stacked bar plots
# (the lexicographic col1 < col2 check visits each unordered pair exactly once)
cols = ['Attrition_Flag', 'Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category','Dependent_count']
for col1 in cols:
for col2 in cols:
if(col1 < col2):
stacked_barplot(df, col1, col2)
Gender F M All Attrition_Flag All 5358 4769 10127 0 4428 4072 8500 1 930 697 1627 ------------------------------------------------------------------------------------------------------------------------
Education_Level 1.0 2.0 3.0 4.0 5.0 6.0 All Attrition_Flag All 1487 2013 1013 3128 516 451 8608 0 1250 1707 859 2641 424 356 7237 1 237 306 154 487 92 95 1371 ------------------------------------------------------------------------------------------------------------------------
Marital_Status Divorced Married Single All Attrition_Flag All 748 4687 3943 9378 0 627 3978 3275 7880 1 121 709 668 1498 ------------------------------------------------------------------------------------------------------------------------
Income_Category 1.0 2.0 3.0 4.0 5.0 All Attrition_Flag All 3561 1790 1402 1535 727 9015 0 2949 1519 1213 1293 601 7575 1 612 271 189 242 126 1440 ------------------------------------------------------------------------------------------------------------------------
Card_Category 1 2 3 4 All Attrition_Flag All 9436 555 116 20 10127 0 7917 473 95 15 8500 1 1519 82 21 5 1627 ------------------------------------------------------------------------------------------------------------------------
Dependent_count 0 1 2 3 4 5 All Attrition_Flag All 904 1838 2655 2732 1574 424 10127 0 769 1569 2238 2250 1314 360 8500 1 135 269 417 482 260 64 1627 ------------------------------------------------------------------------------------------------------------------------
Marital_Status Divorced Married Single All Gender All 748 4687 3943 9378 F 402 2451 2125 4978 M 346 2236 1818 4400 ------------------------------------------------------------------------------------------------------------------------
Income_Category 1.0 2.0 3.0 4.0 5.0 All Gender M 277 776 1402 1535 727 4717 All 3561 1790 1402 1535 727 9015 F 3284 1014 0 0 0 4298 ------------------------------------------------------------------------------------------------------------------------
Gender F M All Education_Level All 4546 4062 8608 4.0 1670 1458 3128 2.0 1028 985 2013 1.0 796 691 1487 3.0 532 481 1013 5.0 263 253 516 6.0 257 194 451 ------------------------------------------------------------------------------------------------------------------------
Marital_Status Divorced Married Single All Education_Level All 652 3999 3322 7973 4.0 225 1479 1197 2901 1.0 136 656 586 1378 2.0 128 949 782 1859 3.0 86 467 386 939 5.0 41 243 189 473 6.0 36 205 182 423 ------------------------------------------------------------------------------------------------------------------------
Income_Category 1.0 2.0 3.0 4.0 5.0 All Education_Level All 3005 1521 1192 1316 607 7641 4.0 1139 553 422 478 204 2796 2.0 671 355 307 308 147 1788 1.0 522 249 195 217 119 1302 3.0 345 183 132 175 70 905 6.0 158 70 59 57 37 381 5.0 170 111 77 81 30 469 ------------------------------------------------------------------------------------------------------------------------
Marital_Status Divorced Married Single All Income_Category All 655 4194 3499 8348 1.0 254 1628 1429 3311 2.0 138 816 704 1658 3.0 108 661 531 1300 4.0 103 735 561 1399 5.0 52 354 274 680 ------------------------------------------------------------------------------------------------------------------------
Gender F M All Card_Category All 5358 4769 10127 1 5101 4335 9436 2 210 345 555 3 38 78 116 4 9 11 20 ------------------------------------------------------------------------------------------------------------------------
Education_Level 1.0 2.0 3.0 4.0 5.0 6.0 All Card_Category All 1487 2013 1013 3128 516 451 8608 1 1391 1888 940 2899 476 422 8016 2 82 98 58 185 32 22 477 3 11 26 15 36 5 5 98 4 3 1 0 8 3 2 17 ------------------------------------------------------------------------------------------------------------------------
Marital_Status Divorced Married Single All Card_Category All 748 4687 3943 9378 1 696 4433 3624 8753 2 46 206 251 503 3 5 41 58 104 4 1 7 10 18 ------------------------------------------------------------------------------------------------------------------------
Income_Category 1.0 2.0 3.0 4.0 5.0 All Card_Category All 3561 1790 1402 1535 727 9015 1 3403 1675 1273 1395 645 8391 2 130 99 96 117 60 502 3 24 15 29 21 18 107 4 4 1 4 2 4 15 ------------------------------------------------------------------------------------------------------------------------
Dependent_count 0 1 2 3 4 5 All Card_Category All 904 1838 2655 2732 1574 424 10127 1 839 1739 2483 2546 1441 388 9436 2 55 84 138 148 100 30 555 3 8 14 28 30 30 6 116 4 2 1 6 8 3 0 20 ------------------------------------------------------------------------------------------------------------------------
Gender F M All Dependent_count All 5358 4769 10127 3 1416 1316 2732 2 1388 1267 2655 1 996 842 1838 4 849 725 1574 0 484 420 904 5 225 199 424 ------------------------------------------------------------------------------------------------------------------------
Education_Level 1.0 2.0 3.0 4.0 5.0 6.0 All Dependent_count All 1487 2013 1013 3128 516 451 8608 2 396 503 272 807 139 130 2247 3 396 572 285 852 142 112 2359 1 279 372 186 572 77 72 1558 4 235 290 160 494 83 69 1331 0 121 196 77 278 50 47 769 5 60 80 33 125 25 21 344 ------------------------------------------------------------------------------------------------------------------------
Marital_Status Divorced Married Single All Dependent_count All 748 4687 3943 9378 3 215 1251 1027 2493 2 204 1284 1003 2491 4 125 727 587 1439 1 105 832 770 1707 0 75 387 399 861 5 24 206 157 387 ------------------------------------------------------------------------------------------------------------------------
Income_Category 1.0 2.0 3.0 4.0 5.0 All Dependent_count All 3561 1790 1402 1535 727 9015 2 900 455 376 404 242 2377 3 896 474 398 473 201 2442 1 693 341 225 255 116 1630 4 535 271 220 266 108 1400 5 148 75 74 56 31 384 0 389 174 109 81 29 782 ------------------------------------------------------------------------------------------------------------------------
# Let's continue the bivariate analysis with stacked bar plots. This time separate attrition and no attrition cases.
# For every unordered pair of categorical columns, plot the crosstab twice:
# once on the retained customers (Attrition_Flag == 0) and once on the
# attrited ones (Attrition_Flag == 1), so the two groups can be compared.
cols = ['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category','Dependent_count']
for col1 in cols:
for col2 in cols:
if(col1 < col2):
print("\nNo attrition - ",col1,"vs",col2,"\n")
stacked_barplot(df[df["Attrition_Flag"]==0], col1, col2)
print("\nAttrition - ",col1,"vs",col2,"\n")
stacked_barplot(df[df["Attrition_Flag"]==1], col1, col2)
No attrition - Gender vs Marital_Status Marital_Status Divorced Married Single All Gender All 627 3978 3275 7880 F 334 2039 1744 4117 M 293 1939 1531 3763 ------------------------------------------------------------------------------------------------------------------------
Attrition - Gender vs Marital_Status Marital_Status Divorced Married Single All Gender All 121 709 668 1498 F 68 412 381 861 M 53 297 287 637 ------------------------------------------------------------------------------------------------------------------------
No attrition - Gender vs Income_Category Income_Category 1.0 2.0 3.0 4.0 5.0 All Gender M 247 671 1213 1293 601 4025 All 2949 1519 1213 1293 601 7575 F 2702 848 0 0 0 3550 ------------------------------------------------------------------------------------------------------------------------
Attrition - Gender vs Income_Category Income_Category 1.0 2.0 3.0 4.0 5.0 All Gender M 30 105 189 242 126 692 All 612 271 189 242 126 1440 F 582 166 0 0 0 748 ------------------------------------------------------------------------------------------------------------------------
No attrition - Education_Level vs Gender Gender F M All Education_Level All 3767 3470 7237 4.0 1377 1264 2641 2.0 856 851 1707 1.0 669 581 1250 3.0 453 406 859 5.0 219 205 424 6.0 193 163 356 ------------------------------------------------------------------------------------------------------------------------
Attrition - Education_Level vs Gender Gender F M All Education_Level All 779 592 1371 4.0 293 194 487 2.0 172 134 306 1.0 127 110 237 3.0 79 75 154 5.0 44 48 92 6.0 64 31 95 ------------------------------------------------------------------------------------------------------------------------
No attrition - Education_Level vs Marital_Status Marital_Status Divorced Married Single All Education_Level All 547 3402 2767 6716 4.0 189 1266 995 2450 1.0 116 563 484 1163 2.0 106 812 664 1582 3.0 75 396 323 794 5.0 33 206 154 393 6.0 28 159 147 334 ------------------------------------------------------------------------------------------------------------------------
Attrition - Education_Level vs Marital_Status Marital_Status Divorced Married Single All Education_Level All 105 597 555 1257 4.0 36 213 202 451 2.0 22 137 118 277 1.0 20 93 102 215 3.0 11 71 63 145 5.0 8 37 35 80 6.0 8 46 35 89 ------------------------------------------------------------------------------------------------------------------------
No attrition - Education_Level vs Income_Category Income_Category 1.0 2.0 3.0 4.0 5.0 All Education_Level All 2489 1300 1034 1107 500 6430 4.0 929 476 370 411 172 2358 2.0 570 304 265 267 119 1525 1.0 442 208 171 180 96 1097 3.0 296 159 115 139 58 767 6.0 116 59 52 46 29 302 5.0 136 94 61 64 26 381 ------------------------------------------------------------------------------------------------------------------------
Attrition - Education_Level vs Income_Category Income_Category 1.0 2.0 3.0 4.0 5.0 All Education_Level All 516 221 158 209 107 1211 4.0 210 77 52 67 32 438 2.0 101 51 42 41 28 263 1.0 80 41 24 37 23 205 3.0 49 24 17 36 12 138 6.0 42 11 7 11 8 79 5.0 34 17 16 17 4 88 ------------------------------------------------------------------------------------------------------------------------
No attrition - Income_Category vs Marital_Status Marital_Status Divorced Married Single All Income_Category All 548 3563 2909 7020 1.0 208 1365 1173 2746 2.0 118 689 599 1406 3.0 98 575 453 1126 4.0 81 632 469 1182 5.0 43 302 215 560 ------------------------------------------------------------------------------------------------------------------------
Attrition - Income_Category vs Marital_Status Marital_Status Divorced Married Single All Income_Category All 107 631 590 1328 1.0 46 263 256 565 4.0 22 103 92 217 2.0 20 127 105 252 3.0 10 86 78 174 5.0 9 52 59 120 ------------------------------------------------------------------------------------------------------------------------
No attrition - Card_Category vs Gender Gender F M All Card_Category All 4428 4072 8500 1 4211 3706 7917 2 182 291 473 3 30 65 95 4 5 10 15 ------------------------------------------------------------------------------------------------------------------------
Attrition - Card_Category vs Gender Gender F M All Card_Category All 930 697 1627 1 890 629 1519 2 28 54 82 3 8 13 21 4 4 1 5 ------------------------------------------------------------------------------------------------------------------------
No attrition - Card_Category vs Education_Level Education_Level 1.0 2.0 3.0 4.0 5.0 6.0 All Card_Category All 1250 1707 859 2641 424 356 7237 1 1169 1601 797 2449 393 331 6740 2 71 83 48 157 25 20 404 3 7 22 14 30 4 4 81 4 3 1 0 5 2 1 12 ------------------------------------------------------------------------------------------------------------------------
Attrition - Card_Category vs Education_Level Education_Level 1.0 2.0 3.0 4.0 5.0 6.0 All Card_Category All 237 306 154 487 92 95 1371 1 222 287 143 450 83 91 1276 2 11 15 10 28 7 2 73 3 4 4 1 6 1 1 17 4 0 0 0 3 1 1 5 ------------------------------------------------------------------------------------------------------------------------
No attrition - Card_Category vs Marital_Status Marital_Status Divorced Married Single All Card_Category All 627 3978 3275 7880 1 582 3759 3013 7354 2 39 179 208 426 3 5 35 47 87 4 1 5 7 13 ------------------------------------------------------------------------------------------------------------------------
Attrition - Card_Category vs Marital_Status Marital_Status Divorced Married Single All Card_Category All 121 709 668 1498 1 114 674 611 1399 2 7 27 43 77 3 0 6 11 17 4 0 2 3 5 ------------------------------------------------------------------------------------------------------------------------
No attrition - Card_Category vs Income_Category Income_Category 1.0 2.0 3.0 4.0 5.0 All Card_Category All 2949 1519 1213 1293 601 7575 1 2817 1418 1101 1180 532 7048 2 110 88 85 95 49 427 3 20 13 23 16 16 88 4 2 0 4 2 4 12 ------------------------------------------------------------------------------------------------------------------------
Attrition - Card_Category vs Income_Category Income_Category 1.0 2.0 3.0 4.0 5.0 All Card_Category All 612 271 189 242 126 1440 1 586 257 172 215 113 1343 2 20 11 11 22 11 75 3 4 2 6 5 2 19 4 2 1 0 0 0 3 ------------------------------------------------------------------------------------------------------------------------
No attrition - Card_Category vs Dependent_count Dependent_count 0 1 2 3 4 5 All Card_Category All 769 1569 2238 2250 1314 360 8500 1 713 1483 2098 2090 1203 330 7917 2 48 74 115 127 84 25 473 3 7 11 21 26 25 5 95 4 1 1 4 7 2 0 15 ------------------------------------------------------------------------------------------------------------------------
Attrition - Card_Category vs Dependent_count Dependent_count 0 1 2 3 4 5 All Card_Category All 135 269 417 482 260 64 1627 1 126 256 385 456 238 58 1519 2 7 10 23 21 16 5 82 3 1 3 7 4 5 1 21 4 1 0 2 1 1 0 5 ------------------------------------------------------------------------------------------------------------------------
No attrition - Dependent_count vs Gender Gender F M All Dependent_count All 4428 4072 8500 3 1144 1106 2250 2 1160 1078 2238 1 827 742 1569 4 695 619 1314 0 407 362 769 5 195 165 360 ------------------------------------------------------------------------------------------------------------------------
Attrition - Dependent_count vs Gender Gender F M All Dependent_count All 930 697 1627 3 272 210 482 2 228 189 417 4 154 106 260 1 169 100 269 0 77 58 135 5 30 34 64 ------------------------------------------------------------------------------------------------------------------------
No attrition - Dependent_count vs Education_Level Education_Level 1.0 2.0 3.0 4.0 5.0 6.0 All Dependent_count All 1250 1707 859 2641 424 356 7237 2 331 434 230 677 121 105 1898 3 327 477 237 712 115 82 1950 1 236 324 159 494 57 64 1334 4 198 246 140 409 70 55 1118 0 108 162 64 240 40 33 647 5 50 64 29 109 21 17 290 ------------------------------------------------------------------------------------------------------------------------
Attrition - Dependent_count vs Education_Level Education_Level 1.0 2.0 3.0 4.0 5.0 6.0 All Dependent_count All 237 306 154 487 92 95 1371 3 69 95 48 140 27 30 409 1 43 48 27 78 20 8 224 2 65 69 42 130 18 25 349 4 37 44 20 85 13 14 213 0 13 34 13 38 10 14 122 5 10 16 4 16 4 4 54 ------------------------------------------------------------------------------------------------------------------------
No attrition - Dependent_count vs Marital_Status Marital_Status Divorced Married Single All Dependent_count All 627 3978 3275 7880 3 182 1027 841 2050 2 170 1099 835 2104 4 106 603 497 1206 1 87 733 637 1457 0 62 341 330 733 5 20 175 135 330 ------------------------------------------------------------------------------------------------------------------------
Attrition - Dependent_count vs Marital_Status Marital_Status Divorced Married Single All Dependent_count All 121 709 668 1498 2 34 185 168 387 3 33 224 186 443 4 19 124 90 233 1 18 99 133 250 0 13 46 69 128 5 4 31 22 57 ------------------------------------------------------------------------------------------------------------------------
No attrition - Dependent_count vs Income_Category Income_Category 1.0 2.0 3.0 4.0 5.0 All Dependent_count All 2949 1519 1213 1293 601 7575 2 754 390 329 344 194 2011 3 724 395 341 389 170 2019 1 578 295 200 219 100 1392 4 436 224 193 224 89 1166 5 128 63 61 46 26 324 0 329 152 89 71 22 663 ------------------------------------------------------------------------------------------------------------------------
Attrition - Dependent_count vs Income_Category Income_Category 1.0 2.0 3.0 4.0 5.0 All Dependent_count All 612 271 189 242 126 1440 2 146 65 47 60 48 366 3 172 79 57 84 31 423 4 99 47 27 42 19 234 1 115 46 25 36 16 238 0 60 22 20 10 7 119 5 20 12 13 10 5 60 ------------------------------------------------------------------------------------------------------------------------
# Boxplots of each numeric column against each categorical column, drawn as a
# two-panel figure: no-attrition customers on the left, attrition on the right.
cols1 = ['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']
cols2 = ['Customer_Age', 'Dependent_count', 'Months_on_book', 'Total_Relationship_Count',
         'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Total_Revolving_Bal',
         'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct',
         'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']
df_attrition = df[df["Attrition_Flag"] == 1]
df_no_attrition = df[df["Attrition_Flag"] == 0]
for cat_col in cols1:
    for num_col in cols2:
        plt.figure(figsize=(10, 10))
        # Left panel: customers who stayed.
        plt.subplot(1, 2, 1)
        sns.boxplot(data=df_no_attrition, x=df_no_attrition[cat_col], y=df_no_attrition[num_col], palette="Blues")
        plt.legend(labels=["No attrition"])
        # Right panel: customers who churned.
        plt.subplot(1, 2, 2)
        sns.boxplot(data=df_attrition, x=df_attrition[cat_col], y=df_attrition[num_col], palette="Blues")
        plt.legend(labels=["Attrition"])
        plt.show()
### Function to plot distributions
def distribution_plot_wrt_target(data, predictor, target):
    """Visualize how `predictor` is distributed across the classes of `target`.

    Draws a 2x2 grid: histograms (with KDE) of the predictor for the first two
    target classes on the top row, and boxplots of predictor vs. target — with
    and then without outlier markers — on the bottom row.
    """
    fig, axs = plt.subplots(2, 2, figsize=(12, 10))
    target_uniq = data[target].unique()
    # Top row: one histogram per target class, in distinct colours.
    for col_idx, colour in ((0, "teal"), (1, "orange")):
        axs[0, col_idx].set_title("Distribution of target for target=" + str(target_uniq[col_idx]))
        sns.histplot(
            data=data[data[target] == target_uniq[col_idx]],
            x=predictor,
            kde=True,
            ax=axs[0, col_idx],
            color=colour,
        )
    # Bottom row: boxplots against the target, with and without fliers.
    axs[1, 0].set_title("Boxplot w.r.t target")
    sns.boxplot(data=data, x=target, y=predictor, ax=axs[1, 0], palette="gist_rainbow")
    axs[1, 1].set_title("Boxplot (without outliers) w.r.t target")
    sns.boxplot(
        data=data,
        x=target,
        y=predictor,
        ax=axs[1, 1],
        showfliers=False,
        palette="gist_rainbow",
    )
    plt.tight_layout()
    plt.show()
# Boxplots and histograms of each numeric attribute, split on attrition status.
cols = ['Customer_Age', 'Dependent_count', 'Months_on_book', 'Total_Relationship_Count',
        'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Total_Revolving_Bal',
        'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct',
        'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']
for numeric_col in cols:
    distribution_plot_wrt_target(df, numeric_col, "Attrition_Flag")
# The same numeric distributions split by the target variable, as plain tables.
# Widen pandas display limits so the describe() tables are not truncated.
pd.set_option('display.max_columns', 5000)
pd.set_option('display.width', 10000)
print('No attrition')
print(df[df['Attrition_Flag'] == 0].describe().T)
print('\nAttrition')
print(df[df['Attrition_Flag'] == 1].describe().T)
No attrition
count mean std min 25% 50% 75% max
Attrition_Flag 8500.0 0.000000 0.000000 0.000 0.000 0.000 0.00000 0.000
Customer_Age 8500.0 46.262118 8.081157 26.000 41.000 46.000 52.00000 73.000
Dependent_count 8500.0 2.335412 1.303229 0.000 1.000 2.000 3.00000 5.000
Education_Level 7237.0 3.048363 1.402392 1.000 2.000 3.000 4.00000 6.000
Income_Category 7575.0 2.350231 1.350500 1.000 1.000 2.000 3.50000 5.000
Card_Category 8500.0 1.083294 0.330621 1.000 1.000 1.000 1.00000 4.000
Months_on_book 8500.0 35.880588 8.021810 13.000 31.000 36.000 40.00000 56.000
Total_Relationship_Count 8500.0 3.914588 1.528949 1.000 3.000 4.000 5.00000 6.000
Months_Inactive_12_mon 8500.0 2.273765 1.016741 0.000 1.000 2.000 3.00000 6.000
Contacts_Count_12_mon 8500.0 2.356353 1.081436 0.000 2.000 2.000 3.00000 5.000
Credit_Limit 8500.0 8726.877518 9084.969807 1438.300 2602.000 4643.500 11252.75000 34516.000
Total_Revolving_Bal 8500.0 1256.604118 757.745354 0.000 800.000 1364.000 1807.00000 2517.000
Avg_Open_To_Buy 8500.0 7470.273400 9087.671862 15.000 1184.500 3469.500 9978.25000 34516.000
Total_Amt_Chng_Q4_Q1 8500.0 0.772510 0.217783 0.256 0.643 0.743 0.86000 3.397
Total_Trans_Amt 8500.0 4654.655882 3512.772635 816.000 2384.750 4100.000 4781.25000 18484.000
Total_Trans_Ct 8500.0 68.672588 22.919011 11.000 54.000 71.000 82.00000 139.000
Total_Ct_Chng_Q4_Q1 8500.0 0.742434 0.228054 0.028 0.617 0.721 0.83300 3.714
Avg_Utilization_Ratio 8500.0 0.296412 0.272568 0.000 0.055 0.211 0.52925 0.994
Attrition
count mean std min 25% 50% 75% max
Attrition_Flag 1627.0 1.000000 0.000000 1.0 1.0000 1.000 1.000 1.000
Customer_Age 1627.0 46.659496 7.665652 26.0 41.0000 47.000 52.000 68.000
Dependent_count 1627.0 2.402581 1.275010 0.0 2.0000 2.000 3.000 5.000
Education_Level 1371.0 3.128373 1.465269 1.0 2.0000 3.000 4.000 6.000
Income_Category 1440.0 2.304861 1.387216 1.0 1.0000 2.000 4.000 5.000
Card_Category 1627.0 1.085433 0.349947 1.0 1.0000 1.000 1.000 4.000
Months_on_book 1627.0 36.178242 7.796548 13.0 32.0000 36.000 40.000 56.000
Total_Relationship_Count 1627.0 3.279656 1.577782 1.0 2.0000 3.000 5.000 6.000
Months_Inactive_12_mon 1627.0 2.693301 0.899623 0.0 2.0000 3.000 3.000 6.000
Contacts_Count_12_mon 1627.0 2.972342 1.090537 0.0 2.0000 3.000 4.000 6.000
Credit_Limit 1627.0 8136.039459 9095.334105 1438.3 2114.0000 4178.000 9933.500 34516.000
Total_Revolving_Bal 1627.0 672.822987 921.385582 0.0 0.0000 0.000 1303.500 2517.000
Avg_Open_To_Buy 1627.0 7463.216472 9109.208129 3.0 1587.0000 3488.000 9257.500 34516.000
Total_Amt_Chng_Q4_Q1 1627.0 0.694277 0.214924 0.0 0.5445 0.701 0.856 1.492
Total_Trans_Amt 1627.0 3095.025814 2308.227629 510.0 1903.5000 2329.000 2772.000 10583.000
Total_Trans_Ct 1627.0 44.933620 14.568429 10.0 37.0000 43.000 51.000 94.000
Total_Ct_Chng_Q4_Q1 1627.0 0.554386 0.226854 0.0 0.4000 0.531 0.692 2.500
Avg_Utilization_Ratio 1627.0 0.162475 0.264458 0.0 0.0000 0.000 0.231 0.999
Attrition customers tend to have a lower median for Total_Revolving_Bal, Total_Trans_Amt, Total_Trans_Ct, Total_Ct_Chng_Q4_Q1, and Avg_Utilization_Ratio.
And a higher median for Months_Inactive_12_mon and Contacts_Count_12_mon.
KNNImputer: Each sample's missing values are imputed by looking at the n_neighbors nearest neighbors found in the training set. The default value is n_neighbors=5. The imputed values might not always be integers, which may not be the best way to impute categorical values.
# Apply a log transform to the right-skewed monetary attributes to reduce the
# skew. This is a fixed, parameter-free transformation, so applying it before
# the train/val/test split does not cause any data leakage.
df["Credit_Limit"] = np.log(df["Credit_Limit"])
df["Avg_Open_To_Buy"] = np.log(df["Avg_Open_To_Buy"])
# Re-plot the two transformed attributes to confirm the skew is reduced.
histogram_boxplot(df, "Credit_Limit")
histogram_boxplot(df, "Avg_Open_To_Buy")
# Count remaining missing values per column.
df.isnull().sum()
Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 1112 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
# Columns that still contain missing values and will be filled by KNN imputation.
reqd_col_for_impute = ["Education_Level", "Marital_Status", "Income_Category"]
# Peek at the columns to be imputed.
df[reqd_col_for_impute].head()
| Education_Level | Marital_Status | Income_Category | |
|---|---|---|---|
| 0 | 2.0 | Married | 3.0 |
| 1 | 4.0 | Single | 1.0 |
| 2 | 4.0 | Married | 4.0 |
| 3 | 2.0 | NaN | 1.0 |
| 4 | 1.0 | Married | 3.0 |
# Keep a working copy of the data set before encoding and splitting, so the
# original df stays untouched.
data1 = df.copy()
data1.head()
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 45 | M | 3 | 2.0 | Married | 3.0 | 1 | 39 | 5 | 1 | 3 | 9.448648 | 777 | 9.385469 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 0 | 49 | F | 5 | 4.0 | Single | 1.0 | 1 | 44 | 6 | 1 | 2 | 9.018695 | 864 | 8.908154 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 0 | 51 | M | 3 | 4.0 | Married | 4.0 | 1 | 36 | 4 | 1 | 0 | 8.136811 | 0 | 8.136811 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 0 | 40 | F | 4 | 2.0 | NaN | 1.0 | 1 | 34 | 3 | 4 | 1 | 8.105609 | 2517 | 6.679599 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 0 | 40 | M | 3 | 1.0 | Married | 3.0 | 1 | 21 | 5 | 1 | 0 | 8.458716 | 0 | 8.458716 | 2.175 | 816 | 28 | 2.500 | 0.000 |
# Inspect dtypes and non-null counts of the working copy.
data1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null int64 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null object 3 Dependent_count 10127 non-null int64 4 Education_Level 8608 non-null float64 5 Marital_Status 9378 non-null object 6 Income_Category 9015 non-null float64 7 Card_Category 10127 non-null int64 8 Months_on_book 10127 non-null int64 9 Total_Relationship_Count 10127 non-null int64 10 Months_Inactive_12_mon 10127 non-null int64 11 Contacts_Count_12_mon 10127 non-null int64 12 Credit_Limit 10127 non-null float64 13 Total_Revolving_Bal 10127 non-null int64 14 Avg_Open_To_Buy 10127 non-null float64 15 Total_Amt_Chng_Q4_Q1 10127 non-null float64 16 Total_Trans_Amt 10127 non-null int64 17 Total_Trans_Ct 10127 non-null int64 18 Total_Ct_Chng_Q4_Q1 10127 non-null float64 19 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(7), int64(11), object(2) memory usage: 1.5+ MB
# Gender counts (including NaN) before label encoding.
data1['Gender'].value_counts(dropna=False)
F 5358 M 4769 Name: Gender, dtype: int64
# Marital-status counts (including NaN) before label encoding.
data1['Marital_Status'].value_counts(dropna=False)
Married 4687 Single 3943 NaN 749 Divorced 748 Name: Marital_Status, dtype: int64
# KNN imputation needs numeric inputs, so label-encode the two string-valued
# categorical columns. Unmapped values (i.e. NaN) remain NaN under .map().
gender = {"M": 0, "F": 1}
maritalstatus = {"Single": 1, "Married": 2, "Divorced": 3}
data1["Gender"] = data1["Gender"].map(gender)
data1["Marital_Status"] = data1["Marital_Status"].map(maritalstatus)
data1.head()
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 45 | 0 | 3 | 2.0 | 2.0 | 3.0 | 1 | 39 | 5 | 1 | 3 | 9.448648 | 777 | 9.385469 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 0 | 49 | 1 | 5 | 4.0 | 1.0 | 1.0 | 1 | 44 | 6 | 1 | 2 | 9.018695 | 864 | 8.908154 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 0 | 51 | 0 | 3 | 4.0 | 2.0 | 4.0 | 1 | 36 | 4 | 1 | 0 | 8.136811 | 0 | 8.136811 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 0 | 40 | 1 | 4 | 2.0 | NaN | 1.0 | 1 | 34 | 3 | 4 | 1 | 8.105609 | 2517 | 6.679599 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 0 | 40 | 0 | 3 | 1.0 | 2.0 | 3.0 | 1 | 21 | 5 | 1 | 0 | 8.458716 | 0 | 8.458716 | 2.175 | 816 | 28 | 2.500 | 0.000 |
# Confirm the gender encoding: counts should match the pre-encoding tally.
data1['Gender'].value_counts(dropna=False)
1 5358 0 4769 Name: Gender, dtype: int64
# Confirm the marital-status encoding; the 749 NaNs are still present and will
# be handled by KNN imputation after the split.
data1['Marital_Status'].value_counts(dropna=False)
2.0 4687 1.0 3943 NaN 749 3.0 748 Name: Marital_Status, dtype: int64
# Separate the predictors from the target.
X = data1.drop(["Attrition_Flag"], axis=1)
y = data1["Attrition_Flag"]
# Splitting data into training, validation and test set:
# first we split data into 2 parts, say temporary and test.
# stratify=y keeps the class ratio identical in every split.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
# then we split the temporary set into train and validation;
# 0.25 of the remaining 80% equals 20% of the full data (60/20/20 overall).
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp
)
print(X_train.shape, X_val.shape, X_test.shape)
(6075, 19) (2026, 19) (2026, 19)
# Report the resulting split sizes in words.
print("Number of rows in train data =", X_train.shape[0])
print("Number of rows in validation data =", X_val.shape[0])
print("Number of rows in test data =", X_test.shape[0])
Number of rows in train data = 6075 Number of rows in validation data = 2026 Number of rows in test data = 2026
Imputing Missing Values
# Build the KNN imputer (default n_neighbors=5). It is fit on the training
# split only and then reused to transform the validation and test splits,
# so no information from val/test leaks into the imputation model.
imputer = KNNImputer(n_neighbors=5)
# Fit on train and fill its missing values
X_train[reqd_col_for_impute] = imputer.fit_transform(X_train[reqd_col_for_impute])
# Transform the val data with the train-fitted imputer
X_val[reqd_col_for_impute] = imputer.transform(X_val[reqd_col_for_impute])
# Transform the test data with the train-fitted imputer
X_test[reqd_col_for_impute] = imputer.transform(X_test[reqd_col_for_impute])
# Checking that no column has missing values in train, validation or test sets
# after imputation.
print(X_train.isna().sum())
print("-" * 30)
print(X_val.isna().sum())
print("-" * 30)
print(X_test.isna().sum())
Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64 ------------------------------ Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64 ------------------------------ Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
## Function to inverse the encoding
def inverse_mapping(x, y):
    """Map the numeric codes in column `y` of X_train/X_val/X_test back to
    their original string labels using the encoding dict `x`.

    Values are rounded first because KNN imputation can produce non-integer
    codes; the result is stored as a pandas categorical.
    """
    inv_dict = {code: label for label, code in x.items()}
    for split in (X_train, X_val, X_test):
        split[y] = np.round(split[y]).map(inv_dict).astype("category")
inverse_mapping(gender, "Gender")
inverse_mapping(maritalstatus, "Marital_Status")
# Show category counts (including NaN) for the re-labelled train columns.
cols = X_train.select_dtypes(include=["object", "category"])
for i in cols.columns:
    print(X_train[i].value_counts(dropna=False))
    print("*" * 30)
F 3193 M 2882 Name: Gender, dtype: int64 ****************************** Married 3129 Single 2516 Divorced 430 Name: Marital_Status, dtype: int64 ******************************
# Show category counts for the re-labelled validation columns.
cols = X_val.select_dtypes(include=["object", "category"])
for i in cols.columns:
    print(X_val[i].value_counts())
    print("*" * 30)
F 1095 M 931 Name: Gender, dtype: int64 ****************************** Married 1056 Single 814 Divorced 156 Name: Marital_Status, dtype: int64 ******************************
# Show category counts for the re-labelled test columns.
cols = X_test.select_dtypes(include=["object", "category"])
for i in cols.columns:
    print(X_test[i].value_counts())
    print("*" * 30)
F 1070 M 956 Name: Gender, dtype: int64 ****************************** Married 1010 Single 854 Divorced 162 Name: Marital_Status, dtype: int64 ******************************
# Based on the earlier boxplots, a few attributes could do with outlier fixes.
# Only the upper tail is treated: any value above Q3 + 1.5*IQR is capped.
# NOTE(review): thresholds are computed separately per split, which is what the
# original comment calls avoiding leakage. The more conventional approach is to
# compute the caps on train only and apply them to val/test — confirm intent.
cols = ['Customer_Age', 'Months_on_book', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon',
        'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct',
        'Total_Ct_Chng_Q4_Q1']

def _cap_upper_outliers(frame, column):
    """Cap `frame[column]` at Q3 + 1.5*IQR of that same frame, in place."""
    q1 = frame[column].quantile(0.25)
    q3 = frame[column].quantile(0.75)
    upper = q3 + 1.5 * (q3 - q1)
    frame[column] = np.where(frame[column] > upper, upper, frame[column])

for col in cols:
    for frame in (X_train, X_val, X_test):
        _cap_upper_outliers(frame, col)
# One-hot encode the remaining categorical columns (drop_first avoids the
# dummy-variable trap).
X_train = pd.get_dummies(X_train, drop_first=True)
X_val = pd.get_dummies(X_val, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)
# get_dummies is applied per split, so a category absent from one split would
# silently produce mismatched columns. Align val/test to the train columns;
# this is a no-op when all splits share the same categories.
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
print(X_train.shape, X_val.shape, X_test.shape)
(6075, 20) (2026, 20) (2026, 20)
# Print the target variable's distribution in the train, test and validation
# datasets — stratified splitting should keep the class ratio near-identical.
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))
print(y_val.value_counts(normalize=True))
0 0.839342 1 0.160658 Name: Attrition_Flag, dtype: float64 0 0.839585 1 0.160415 Name: Attrition_Flag, dtype: float64 0 0.839092 1 0.160908 Name: Attrition_Flag, dtype: float64
# Compare customers with Months_on_book exactly 36 against everyone else.
# NOTE(review): presumably 36 was flagged earlier as an unusually frequent
# value (possible default/placeholder) — confirm against the earlier EDA.
print("\nDistribution of values for Months_on_book=36 customers\n")
print(df[df["Months_on_book"]==36].describe().T)
print("\nDistribution of values for Months_on_book != 36 customers\n")
print(df[df["Months_on_book"]!=36].describe().T)
Distribution of values for Months_on_book=36 customers
count mean std min 25% 50% 75% max
Attrition_Flag 2463.0 0.174584 0.379688 0.000000 0.000000 0.000000 0.000000 1.000000
Customer_Age 2463.0 46.155907 7.606811 26.000000 41.000000 46.000000 51.000000 73.000000
Dependent_count 2463.0 2.404791 1.304467 0.000000 1.000000 2.000000 3.000000 5.000000
Education_Level 2076.0 3.014933 1.425167 1.000000 2.000000 3.000000 4.000000 6.000000
Income_Category 2187.0 2.258345 1.343044 1.000000 1.000000 2.000000 3.000000 5.000000
Card_Category 2463.0 1.084856 0.333156 1.000000 1.000000 1.000000 1.000000 4.000000
Months_on_book 2463.0 36.000000 0.000000 36.000000 36.000000 36.000000 36.000000 36.000000
Total_Relationship_Count 2463.0 3.782379 1.563099 1.000000 3.000000 4.000000 5.000000 6.000000
Months_Inactive_12_mon 2463.0 2.346326 0.966032 0.000000 2.000000 2.000000 3.000000 6.000000
Contacts_Count_12_mon 2463.0 2.427933 1.105146 0.000000 2.000000 2.000000 3.000000 6.000000
Credit_Limit 2463.0 8.581531 0.935247 7.271217 7.830426 8.383662 9.264307 10.449178
Total_Revolving_Bal 2463.0 1151.958587 824.045454 0.000000 0.000000 1264.000000 1778.000000 2517.000000
Avg_Open_To_Buy 2463.0 8.135428 1.316965 2.302585 7.157342 8.106515 9.155198 10.449178
Total_Amt_Chng_Q4_Q1 2463.0 0.755181 0.222819 0.000000 0.627000 0.733000 0.853000 3.397000
Total_Trans_Amt 2463.0 4498.042225 3452.387799 510.000000 2171.500000 3978.000000 4784.500000 18484.000000
Total_Trans_Ct 2463.0 65.209907 23.540643 10.000000 45.000000 68.000000 81.000000 134.000000
Total_Ct_Chng_Q4_Q1 2463.0 0.712825 0.245057 0.000000 0.579000 0.698000 0.820000 3.250000
Avg_Utilization_Ratio 2463.0 0.276601 0.279178 0.000000 0.000000 0.178000 0.504500 0.995000
Distribution of values for Months_on_book != 36 customers
count mean std min 25% 50% 75% max
Attrition_Flag 7664.0 0.156185 0.363054 0.000000 0.000000 0.000000 0.000000 1.000000
Customer_Age 7664.0 46.380611 8.143922 26.000000 41.000000 46.000000 52.000000 70.000000
Dependent_count 7664.0 2.327375 1.296640 0.000000 1.000000 2.000000 3.000000 5.000000
Education_Level 6532.0 3.075781 1.408653 1.000000 2.000000 3.000000 4.000000 6.000000
Income_Category 6828.0 2.370094 1.359707 1.000000 1.000000 2.000000 4.000000 5.000000
Card_Category 7664.0 1.083246 0.334007 1.000000 1.000000 1.000000 1.000000 4.000000
Months_on_book 7664.0 35.905402 9.180491 13.000000 29.000000 37.000000 42.000000 56.000000
Total_Relationship_Count 7664.0 3.822286 1.551582 1.000000 3.000000 4.000000 5.000000 6.000000
Months_Inactive_12_mon 7664.0 2.339509 1.024596 0.000000 2.000000 2.000000 3.000000 6.000000
Contacts_Count_12_mon 7664.0 2.464118 1.106500 0.000000 2.000000 2.000000 3.000000 6.000000
Credit_Limit 7664.0 8.610444 0.933246 7.271217 7.849226 8.432397 9.328168 10.449178
Total_Revolving_Bal 7664.0 1166.302714 812.078213 0.000000 461.000000 1279.500000 1785.000000 2517.000000
Avg_Open_To_Buy 7664.0 8.172961 1.309581 1.098612 7.200051 8.163941 9.210465 10.449178
Total_Amt_Chng_Q4_Q1 7664.0 0.761470 0.218026 0.000000 0.631000 0.738000 0.862000 2.675000
Total_Trans_Amt 7664.0 4373.891441 3378.851839 530.000000 2151.750000 3878.000000 4726.250000 17995.000000
Total_Trans_Ct 7664.0 64.745825 23.451074 10.000000 45.000000 67.000000 80.000000 139.000000
Total_Ct_Chng_Q4_Q1 7664.0 0.712029 0.235818 0.000000 0.583000 0.703000 0.818000 3.714000
Avg_Utilization_Ratio 7664.0 0.274345 0.274578 0.000000 0.026000 0.175000 0.502000 0.999000
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Compute accuracy, recall, precision and F1 for a fitted classifier.

    model: fitted classifier (must implement .predict)
    predictors: independent variables
    target: true labels

    Returns a single-row DataFrame with columns Accuracy, Recall,
    Precision and F1.
    """
    # Predict once, then derive every metric from the same predictions.
    pred = model.predict(predictors)
    scores = {
        "Accuracy": accuracy_score(target, pred),
        "Recall": recall_score(target, pred),
        "Precision": precision_score(target, pred),
        "F1": f1_score(target, pred),
    }
    return pd.DataFrame(scores, index=[0])
def confusion_matrix_sklearn(model, predictors, target):
    """
    To plot the confusion_matrix with percentages
    model: classifier
    predictors: independent variables
    target: dependent variable
    """
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)
    flat = cm.flatten()
    total = flat.sum()
    # annotate each cell with its raw count and its share of all samples
    labels = np.asarray(
        ["{0:0.0f}".format(item) + "\n{0:.2%}".format(item / total) for item in flat]
    ).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
Let's start by building different models using KFold and cross_val_score and tune the best model using GridSearchCV and RandomizedSearchCV
Stratified K-Folds cross-validation provides dataset indices to split data into train/validation sets. It splits the dataset into k consecutive folds (without shuffling by default), keeping the distribution of both classes in each fold the same as in the target variable. Each fold is then used once as validation while the k - 1 remaining folds form the training set.

models = []  # Empty list to store all the models
# Appending (name, estimator) pairs into the list of candidate models
m_bagging = BaggingClassifier(random_state=1)
models.append(("Bagging", m_bagging))
m_randomforest = RandomForestClassifier(random_state=1)
models.append(("Random forest", m_randomforest))
m_gbm = GradientBoostingClassifier(random_state=1)
models.append(("GBM", m_gbm))
m_adaboost = AdaBoostClassifier(random_state=1)
models.append(("Adaboost", m_adaboost))
m_xgboost = XGBClassifier(random_state=1, eval_metric="logloss")
models.append(("Xgboost", m_xgboost))
m_dtree = DecisionTreeClassifier(random_state=1)
models.append(("dtree", m_dtree))
results = []  # Empty list to store all model's CV scores
names = []  # Empty list to store name of the models
# loop through all models to get the mean cross validated score
print("\n" "Cross-Validation Performance:" "\n")
# Hoisted loop invariants: the scoring metric and the CV splitter are identical
# for every model, so there is no need to rebuild them on each iteration.
# StratifiedKFold with a fixed random_state yields the same splits every call.
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)  # 5 stratified folds
for name, model in models:
    cv_result = cross_val_score(
        estimator=model, X=X_train, y=y_train, scoring=scoring, cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))
print("\n" "Training Performance:" "\n")
for name, model in models:
    model.fit(X_train, y_train)
    scores = recall_score(y_train, model.predict(X_train)) * 100
    print("{}: {}".format(name, scores))
Cross-Validation Performance: Bagging: 78.27629513343798 Random forest: 77.25065410779696 GBM: 81.55416012558871 Adaboost: 81.55311355311355 Xgboost: 85.75614861329146 dtree: 78.37833594976452 Training Performance: Bagging: 98.25819672131148 Random forest: 100.0 GBM: 87.90983606557377 Adaboost: 83.81147540983606 Xgboost: 100.0 dtree: 100.0
# Box plots comparing the CV recall distributions of the baseline models
fig = plt.figure(figsize=(10, 7))
fig.suptitle("Algorithm Comparison")
ax = fig.add_subplot(111)
ax.boxplot(results)  # draw directly on the created axes
ax.set_xticklabels(names)
plt.show()
# Class balance before resampling (1 = attriting customer)
print("Before Oversampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before Oversampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
# Synthetic Minority Over Sampling Technique: synthesize minority samples from
# 5 nearest neighbours until the classes are balanced 1:1
sm = SMOTE(sampling_strategy=1, k_neighbors=5, random_state=1)
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
# Class balance and shapes after resampling
print("After Oversampling, counts of label 'Yes': {}".format(sum(y_train_over == 1)))
print("After Oversampling, counts of label 'No': {} \n".format(sum(y_train_over == 0)))
print("After Oversampling, the shape of train_X: {}".format(X_train_over.shape))
print("After Oversampling, the shape of train_y: {} \n".format(y_train_over.shape))
Before Oversampling, counts of label 'Yes': 976 Before Oversampling, counts of label 'No': 5099 After Oversampling, counts of label 'Yes': 5099 After Oversampling, counts of label 'No': 5099 After Oversampling, the shape of train_X: (10198, 20) After Oversampling, the shape of train_y: (10198,)
# Candidate models to be trained on the SMOTE-oversampled data
models_over = []  # Empty list to store all the oversampled models
m_bagging_over = BaggingClassifier(random_state=1)
models_over.append(("Bagging", m_bagging_over))
m_randomforest_over = RandomForestClassifier(random_state=1)
models_over.append(("Random forest", m_randomforest_over))
m_gbm_over = GradientBoostingClassifier(random_state=1)
models_over.append(("GBM", m_gbm_over))
m_adaboost_over = AdaBoostClassifier(random_state=1)
models_over.append(("Adaboost", m_adaboost_over))
m_xgboost_over = XGBClassifier(random_state=1, eval_metric="logloss")
models_over.append(("Xgboost", m_xgboost_over))
m_dtree_over = DecisionTreeClassifier(random_state=1)
models_over.append(("dtree", m_dtree_over))
results_over = []  # Empty list to store all model's CV scores
names_over = []  # Empty list to store name of the models
# loop through all models to get the mean cross validated score
print("\n" "Oversampling case - Cross-Validation Performance:" "\n")
# Hoisted loop invariants: same scorer and deterministic splitter for every model
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)  # 5 stratified folds
for name, model in models_over:
    cv_result = cross_val_score(
        estimator=model, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
    )
    results_over.append(cv_result)
    names_over.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))
print("\n" "Oversampling case - Training Performance:" "\n")
for name, model in models_over:
    model.fit(X_train_over, y_train_over)
    scores = recall_score(y_train_over, model.predict(X_train_over)) * 100
    print("{}: {}".format(name, scores))
Oversampling case - Cross-Validation Performance: Bagging: 96.52871904404549 Random forest: 98.01918451384479 GBM: 97.50936135003559 Adaboost: 96.7838519117166 Xgboost: 98.4898882025823 dtree: 95.48929169312474 Oversampling case - Training Performance: Bagging: 99.84310649146892 Random forest: 100.0 GBM: 98.2349480290253 Adaboost: 97.17591684644049 Xgboost: 100.0 dtree: 100.0
# Box plots comparing CV recall distributions for the oversampling experiment
fig = plt.figure(figsize=(10, 7))
fig.suptitle("Oversampling case - Algorithm Comparison")
ax = fig.add_subplot(111)
ax.boxplot(results_over)  # draw directly on the created axes
ax.set_xticklabels(names_over)
plt.show()
# Randomly drop majority-class rows until the classes are balanced
rus = RandomUnderSampler(random_state=1)
# Class balance before resampling (1 = attriting customer)
print("Before Undersampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before Undersampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)
# Class balance and shapes after resampling
print("After Undersampling, counts of label 'Yes': {}".format(sum(y_train_un == 1)))
print("After Undersampling, counts of label 'No': {} \n".format(sum(y_train_un == 0)))
print("After Undersampling, the shape of train_X: {}".format(X_train_un.shape))
print("After Undersampling, the shape of train_y: {} \n".format(y_train_un.shape))
Before Undersampling, counts of label 'Yes': 976 Before Undersampling, counts of label 'No': 5099 After Undersampling, counts of label 'Yes': 976 After Undersampling, counts of label 'No': 976 After Undersampling, the shape of train_X: (1952, 20) After Undersampling, the shape of train_y: (1952,)
# Candidate models to be trained on the undersampled data
# (original comment wrongly said "oversampled models")
models_un = []  # Empty list to store all the undersampled models
m_bagging_un = BaggingClassifier(random_state=1)
models_un.append(("Bagging", m_bagging_un))
m_randomforest_un = RandomForestClassifier(random_state=1)
models_un.append(("Random forest", m_randomforest_un))
m_gbm_un = GradientBoostingClassifier(random_state=1)
models_un.append(("GBM", m_gbm_un))
m_adaboost_un = AdaBoostClassifier(random_state=1)
models_un.append(("Adaboost", m_adaboost_un))
m_xgboost_un = XGBClassifier(random_state=1, eval_metric="logloss")
models_un.append(("Xgboost", m_xgboost_un))
m_dtree_un = DecisionTreeClassifier(random_state=1)
models_un.append(("dtree", m_dtree_un))
results_un = []  # Empty list to store all model's CV scores
names_un = []  # Empty list to store name of the models
# loop through all models to get the mean cross validated score
print("\n" "Undersampling case - Cross-Validation Performance:" "\n")
# Hoisted loop invariants: same scorer and deterministic splitter for every model
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)  # 5 stratified folds
for name, model in models_un:
    cv_result = cross_val_score(
        estimator=model, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold
    )
    results_un.append(cv_result)
    names_un.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))
print("\n" "Undersampling case - Training Performance:" "\n")
for name, model in models_un:
    model.fit(X_train_un, y_train_un)
    scores = recall_score(y_train_un, model.predict(X_train_un)) * 100
    print("{}: {}".format(name, scores))
Undersampling case - Cross-Validation Performance: Bagging: 90.06227106227107 Random forest: 93.65044479330194 GBM: 94.05808477237049 Adaboost: 92.52276295133439 Xgboost: 95.18472004186289 dtree: 88.31815803244375 Undersampling case - Training Performance: Bagging: 99.48770491803278 Random forest: 100.0 GBM: 98.15573770491804 Adaboost: 94.77459016393442 Xgboost: 100.0 dtree: 100.0
# Box plots comparing CV recall distributions for the undersampling experiment
fig = plt.figure(figsize=(10, 7))
fig.suptitle("Undersampling case - Algorithm Comparison")
ax = fig.add_subplot(111)
ax.boxplot(results_un)  # draw directly on the created axes
ax.set_xticklabels(names_un)
plt.show()
# Calculating different metrics on the train AND validation sets for every
# baseline model (original comment mentioned only the train set).
model_train_perf = []  # one-row metric DataFrames, ordered like `models`
model_val_perf = []  # matching validation metrics
for name, fitted_model in models:
    train_perf = model_performance_classification_sklearn(fitted_model, X_train, y_train)
    print("Training performance for:", name)
    print(train_perf)
    # DataFrame.copy() is an independent deep copy; the original used
    # copy.deepcopy although the `copy` module is never imported here.
    model_train_perf.append(train_perf.copy())
    print("")
    val_perf = model_performance_classification_sklearn(fitted_model, X_val, y_val)
    print("Validation performance for:", name)
    print(val_perf)
    model_val_perf.append(val_perf.copy())
    print("")
Training performance for: Bagging Accuracy Recall Precision F1 0 0.996543 0.982582 0.995846 0.98917 Validation performance for: Bagging Accuracy Recall Precision F1 0 0.952616 0.809816 0.885906 0.846154 Training performance for: Random forest Accuracy Recall Precision F1 0 1.0 1.0 1.0 1.0 Validation performance for: Random forest Accuracy Recall Precision F1 0 0.959033 0.809816 0.926316 0.864157 Training performance for: GBM Accuracy Recall Precision F1 0 0.973827 0.879098 0.954394 0.9152 Validation performance for: GBM Accuracy Recall Precision F1 0 0.96693 0.852761 0.936027 0.892456 Training performance for: Adaboost Accuracy Recall Precision F1 0 0.957695 0.838115 0.892039 0.864237 Validation performance for: Adaboost Accuracy Recall Precision F1 0 0.959526 0.855828 0.888535 0.871875 Training performance for: Xgboost Accuracy Recall Precision F1 0 1.0 1.0 1.0 1.0 Validation performance for: Xgboost Accuracy Recall Precision F1 0 0.969398 0.877301 0.928571 0.902208 Training performance for: dtree Accuracy Recall Precision F1 0 1.0 1.0 1.0 1.0 Validation performance for: dtree Accuracy Recall Precision F1 0 0.940276 0.803681 0.821317 0.812403
# Validation-set confusion matrix for every baseline model
for name, fitted_model in models:
    print("Confusion matrix for:", name)
    confusion_matrix_sklearn(fitted_model, X_val, y_val)
    print("")
Confusion matrix for: Bagging Confusion matrix for: Random forest Confusion matrix for: GBM Confusion matrix for: Adaboost Confusion matrix for: Xgboost Confusion matrix for: dtree
# Train and validation metrics for every model fitted on the oversampled data
model_train_perf_over = []  # one-row metric DataFrames, ordered like `models_over`
model_val_perf_over = []  # matching validation metrics
for name, fitted_model in models_over:
    train_perf = model_performance_classification_sklearn(
        fitted_model, X_train_over, y_train_over
    )
    print("Oversampling case - Training performance for:", name)
    print(train_perf)
    # DataFrame.copy() replaces copy.deepcopy (the `copy` module is not imported)
    model_train_perf_over.append(train_perf.copy())
    print("")
    val_perf = model_performance_classification_sklearn(fitted_model, X_val, y_val)
    print("Oversampling case - Validation performance for:", name)
    print(val_perf)
    model_val_perf_over.append(val_perf.copy())
    print("")
Oversampling case - Training performance for: Bagging Accuracy Recall Precision F1 0 0.998725 0.998431 0.999019 0.998725 Oversampling case - Validation performance for: Bagging Accuracy Recall Precision F1 0 0.946693 0.843558 0.828313 0.835866 Oversampling case - Training performance for: Random forest Accuracy Recall Precision F1 0 1.0 1.0 1.0 1.0 Oversampling case - Validation performance for: Random forest Accuracy Recall Precision F1 0 0.950642 0.828221 0.859873 0.84375 Oversampling case - Training performance for: GBM Accuracy Recall Precision F1 0 0.981369 0.982349 0.980427 0.981387 Oversampling case - Validation performance for: GBM Accuracy Recall Precision F1 0 0.963475 0.895706 0.879518 0.887538 Oversampling case - Training performance for: Adaboost Accuracy Recall Precision F1 0 0.96568 0.971759 0.960085 0.965887 Oversampling case - Validation performance for: Adaboost Accuracy Recall Precision F1 0 0.948667 0.895706 0.80663 0.848837 Oversampling case - Training performance for: Xgboost Accuracy Recall Precision F1 0 1.0 1.0 1.0 1.0 Oversampling case - Validation performance for: Xgboost Accuracy Recall Precision F1 0 0.969398 0.898773 0.909938 0.904321 Oversampling case - Training performance for: dtree Accuracy Recall Precision F1 0 1.0 1.0 1.0 1.0 Oversampling case - Validation performance for: dtree Accuracy Recall Precision F1 0 0.924482 0.788344 0.753666 0.770615
# Validation-set confusion matrix for every oversampling-trained model
for name, fitted_model in models_over:
    print("Oversampling case - Confusion matrix for:", name)
    confusion_matrix_sklearn(fitted_model, X_val, y_val)
    print("")
Oversampling case - Confusion matrix for: Bagging Oversampling case - Confusion matrix for: Random forest Oversampling case - Confusion matrix for: GBM Oversampling case - Confusion matrix for: Adaboost Oversampling case - Confusion matrix for: Xgboost Oversampling case - Confusion matrix for: dtree
# Train and validation metrics for every model fitted on the undersampled data.
# The original looped over range(len(models)) while indexing models_un; iterate
# models_un directly so the loop cannot go out of sync with the list it reads.
model_train_perf_un = []  # one-row metric DataFrames, ordered like `models_un`
model_val_perf_un = []  # matching validation metrics
for name, fitted_model in models_un:
    train_perf = model_performance_classification_sklearn(
        fitted_model, X_train_un, y_train_un
    )
    print("Undersampling case - Training performance for:", name)
    # DataFrame.copy() replaces copy.deepcopy (the `copy` module is not imported)
    model_train_perf_un.append(train_perf.copy())
    print(train_perf)
    print("")
    val_perf = model_performance_classification_sklearn(fitted_model, X_val, y_val)
    print("Undersampling case - Validation performance for:", name)
    print(val_perf)
    model_val_perf_un.append(val_perf.copy())
    print("")
Undersampling case - Training performance for: Bagging Accuracy Recall Precision F1 0 0.996414 0.994877 0.997945 0.996408 Undersampling case - Validation performance for: Bagging Accuracy Recall Precision F1 0 0.923001 0.923313 0.696759 0.794195 Undersampling case - Training performance for: Random forest Accuracy Recall Precision F1 0 1.0 1.0 1.0 1.0 Undersampling case - Validation performance for: Random forest Accuracy Recall Precision F1 0 0.93386 0.935583 0.729665 0.819892 Undersampling case - Training performance for: GBM Accuracy Recall Precision F1 0 0.975922 0.981557 0.970618 0.976057 Undersampling case - Validation performance for: GBM Accuracy Recall Precision F1 0 0.938302 0.96319 0.735363 0.833997 Undersampling case - Training performance for: Adaboost Accuracy Recall Precision F1 0 0.942623 0.947746 0.938134 0.942915 Undersampling case - Validation performance for: Adaboost Accuracy Recall Precision F1 0 0.928924 0.95092 0.707763 0.811518 Undersampling case - Training performance for: Xgboost Accuracy Recall Precision F1 0 1.0 1.0 1.0 1.0 Undersampling case - Validation performance for: Xgboost Accuracy Recall Precision F1 0 0.941757 0.969325 0.745283 0.842667 Undersampling case - Training performance for: dtree Accuracy Recall Precision F1 0 1.0 1.0 1.0 1.0 Undersampling case - Validation performance for: dtree Accuracy Recall Precision F1 0 0.892892 0.917178 0.611452 0.733742
# Validation-set confusion matrix for every undersampling-trained model
for name, fitted_model in models_un:
    print("Undersampling case - Confusion matrix for:", name)
    confusion_matrix_sklearn(fitted_model, X_val, y_val)
    print("")
Undersampling case - Confusion matrix for: Bagging Undersampling case - Confusion matrix for: Random forest Undersampling case - Confusion matrix for: GBM Undersampling case - Confusion matrix for: Adaboost Undersampling case - Confusion matrix for: Xgboost Undersampling case - Confusion matrix for: dtree
Let's try:
a) Regularization, to see if overfitting can be reduced
b) Undersampling the training set, to handle the imbalance between classes and check the model performance.
# training performance comparison
print("\nTraining performance summary")
# (metrics list, column-name suffix) per sampling strategy; each list holds one
# one-row DataFrame per model, in the same order as `models`. Comprehensions
# replace the 18 hand-written concat entries and 18 hand-written column names.
_train_perf_groups = [
    (model_train_perf, ""),
    (model_train_perf_over, " oversampled"),
    (model_train_perf_un, " undersampled"),
]
models_train_comp_df = pd.concat(
    [perf.T for group, _ in _train_perf_groups for perf in group],
    axis=1,
)
models_train_comp_df.columns = [
    name + suffix for _, suffix in _train_perf_groups for name, _ in models
]
models_train_comp_df.T
Training performance summary
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| Bagging | 0.996543 | 0.982582 | 0.995846 | 0.989170 |
| Random forest | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| GBM | 0.973827 | 0.879098 | 0.954394 | 0.915200 |
| Adaboost | 0.957695 | 0.838115 | 0.892039 | 0.864237 |
| Xgboost | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| dtree | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| Bagging oversampled | 0.998725 | 0.998431 | 0.999019 | 0.998725 |
| Random forest oversampled | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| GBM oversampled | 0.981369 | 0.982349 | 0.980427 | 0.981387 |
| Adaboost oversampled | 0.965680 | 0.971759 | 0.960085 | 0.965887 |
| Xgboost oversampled | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| dtree oversampled | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| Bagging undersampled | 0.996414 | 0.994877 | 0.997945 | 0.996408 |
| Random forest undersampled | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| GBM undersampled | 0.975922 | 0.981557 | 0.970618 | 0.976057 |
| Adaboost undersampled | 0.942623 | 0.947746 | 0.938134 | 0.942915 |
| Xgboost undersampled | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| dtree undersampled | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
# validation dataset performance comparison
print("\nValidation dataset performance summary")
# (metrics list, column-name suffix) per sampling strategy; each list holds one
# one-row DataFrame per model, in the same order as `models`. Comprehensions
# replace the 18 hand-written concat entries and 18 hand-written column names.
_val_perf_groups = [
    (model_val_perf, ""),
    (model_val_perf_over, " oversampled"),
    (model_val_perf_un, " undersampled"),
]
models_val_comp_df = pd.concat(
    [perf.T for group, _ in _val_perf_groups for perf in group],
    axis=1,
)
models_val_comp_df.columns = [
    name + suffix for _, suffix in _val_perf_groups for name, _ in models
]
models_val_comp_df.T
Validation dataset performance summary
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| Bagging | 0.952616 | 0.809816 | 0.885906 | 0.846154 |
| Random forest | 0.959033 | 0.809816 | 0.926316 | 0.864157 |
| GBM | 0.966930 | 0.852761 | 0.936027 | 0.892456 |
| Adaboost | 0.959526 | 0.855828 | 0.888535 | 0.871875 |
| Xgboost | 0.969398 | 0.877301 | 0.928571 | 0.902208 |
| dtree | 0.940276 | 0.803681 | 0.821317 | 0.812403 |
| Bagging oversampled | 0.946693 | 0.843558 | 0.828313 | 0.835866 |
| Random forest oversampled | 0.950642 | 0.828221 | 0.859873 | 0.843750 |
| GBM oversampled | 0.963475 | 0.895706 | 0.879518 | 0.887538 |
| Adaboost oversampled | 0.948667 | 0.895706 | 0.806630 | 0.848837 |
| Xgboost oversampled | 0.969398 | 0.898773 | 0.909938 | 0.904321 |
| dtree oversampled | 0.924482 | 0.788344 | 0.753666 | 0.770615 |
| Bagging undersampled | 0.923001 | 0.923313 | 0.696759 | 0.794195 |
| Random forest undersampled | 0.933860 | 0.935583 | 0.729665 | 0.819892 |
| GBM undersampled | 0.938302 | 0.963190 | 0.735363 | 0.833997 |
| Adaboost undersampled | 0.928924 | 0.950920 | 0.707763 | 0.811518 |
| Xgboost undersampled | 0.941757 | 0.969325 | 0.745283 | 0.842667 |
| dtree undersampled | 0.892892 | 0.917178 | 0.611452 | 0.733742 |
Note: We considered the recall metric to pick the best models
**We will tune the AdaBoost, XGBoost and GBM models using RandomizedSearchCV.**
First, let's create two functions to calculate different metrics and confusion matrix so that we don't have to use the same code repeatedly for each model.
# NOTE: this redefines the helper introduced earlier in the notebook with
# identical behavior; it scores a fitted classifier and returns the metrics
# as a one-row DataFrame.
def model_performance_classification_sklearn(model, predictors, target):
    """
    Function to compute different metrics to check classification model performance
    model: classifier
    predictors: independent variables
    target: dependent variable
    """
    pred = model.predict(predictors)  # class predictions
    acc = accuracy_score(target, pred)
    recall = recall_score(target, pred)
    precision = precision_score(target, pred)
    f1 = f1_score(target, pred)
    # list-valued columns give the default RangeIndex row 0
    return pd.DataFrame(
        {
            "Accuracy": [acc],
            "Recall": [recall],
            "Precision": [precision],
            "F1": [f1],
        }
    )
def confusion_matrix_sklearn(model, predictors, target):
    """
    To plot the confusion_matrix with percentages
    model: classifier
    predictors: independent variables
    target: dependent variable
    """
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)
    flat = cm.flatten()
    total = flat.sum()
    # build "count\npercentage" annotation for each of the four cells
    cell_text = []
    for item in flat:
        cell_text.append(["{0:0.0f}".format(item) + "\n{0:.2%}".format(item / total)])
    labels = np.asarray(cell_text).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
%%time
# Hyper-parameter tuning for AdaBoost on the undersampled training data,
# scored on recall (the business metric: catch attriting customers).
# defining model
model = AdaBoostClassifier(random_state=1)
# Parameter grid to sample from in RandomizedSearchCV
# (original comment said GridSearchCV, but RandomizedSearchCV is used below)
param_grid = {
"n_estimators": np.arange(10, 110, 10),
"learning_rate": [0.1, 0.01, 0.2, 0.05, 1, 1.1, 1.2],
# candidate weak learners: shallow trees of depth 1-4
"base_estimator": [
DecisionTreeClassifier(max_depth=1, random_state=1),
DecisionTreeClassifier(max_depth=2, random_state=1),
DecisionTreeClassifier(max_depth=3, random_state=1),
DecisionTreeClassifier(max_depth=4, random_state=1),
],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling RandomizedSearchCV: 50 sampled combinations, 5-fold CV, all CPU cores
randomized_cv = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_jobs = -1, n_iter=50, scoring=scorer, cv=5, random_state=1)
# Fitting parameters in RandomizedSearchCV on the undersampled train set
randomized_cv.fit(X_train_un,y_train_un)
print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))
Best parameters are {'n_estimators': 100, 'learning_rate': 1, 'base_estimator': DecisionTreeClassifier(max_depth=2, random_state=1)} with CV score=0.950821559392988:
CPU times: user 906 ms, sys: 192 ms, total: 1.1 s
Wall time: 18.1 s
# Rebuild AdaBoost with the best hyper-parameters found by the randomized search
adb_tuned2 = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=2, random_state=1),
    n_estimators=100,
    learning_rate=1,
    random_state=1,
)
# Train the tuned model on the undersampled training data
adb_tuned2.fit(X_train_un, y_train_un)
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2,
random_state=1),
learning_rate=1, n_estimators=100, random_state=1)
# Tuned AdaBoost metrics on the (undersampled) training set
Adaboost_random_train = model_performance_classification_sklearn(
    model=adb_tuned2, predictors=X_train_un, target=y_train_un
)
print("Training performance:")
Adaboost_random_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
# Tuned AdaBoost metrics on the validation set
Adaboost_random_val = model_performance_classification_sklearn(
    model=adb_tuned2, predictors=X_val, target=y_val
)
print("Validation performance:")
Adaboost_random_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.944719 | 0.960123 | 0.759709 | 0.848238 |
# Confusion matrix of the tuned AdaBoost on the validation set
confusion_matrix_sklearn(model=adb_tuned2, predictors=X_val, target=y_val)
Now, we turn to the second of the three models chosen.
%%time
# Hyper-parameter tuning for XGBoost on the undersampled training data.
# defining model
model = XGBClassifier(random_state=1,eval_metric='logloss')
# Parameter grid to pass in RandomizedSearchCV
# NOTE(review): scale_pos_weight values of 2-20 on an already-balanced
# (undersampled) train set heavily favor predicting the positive class;
# the reported CV recall of 1.0 and the later validation precision of ~0.21
# suggest the selected model predicts nearly everyone as an attriter.
param_grid={'n_estimators':np.arange(50,150,50),
'scale_pos_weight':[2,5,10,15,20],
'learning_rate':[0.01,0.1,0.2,0.05],
'gamma':[0,1,3,5],
'subsample':[0.8,0.9,1],
'max_depth':np.arange(1,5,1),
'reg_lambda':[5,10,15, 20]}
# Type of scoring used to compare parameter combinations (recall)
scorer = metrics.make_scorer(metrics.recall_score)
# Calling RandomizedSearchCV: 50 sampled combinations, 5-fold CV, all CPU cores
xgb_tuned2 = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=50, scoring=scorer, cv=5, random_state=1, n_jobs = -1)
# Fitting parameters in RandomizedSearchCV on the undersampled train set
xgb_tuned2.fit(X_train_un,y_train_un)
print("Best parameters are {} with CV score={}:" .format(xgb_tuned2.best_params_,xgb_tuned2.best_score_))
Best parameters are {'subsample': 0.9, 'scale_pos_weight': 15, 'reg_lambda': 20, 'n_estimators': 100, 'max_depth': 2, 'learning_rate': 0.01, 'gamma': 0} with CV score=1.0:
CPU times: user 1.24 s, sys: 88.6 ms, total: 1.32 s
Wall time: 7.39 s
# Rebuild XGBoost with the best hyper-parameters found by the randomized search.
# eval_metric="logloss" is added for consistency with the estimator used during
# the search (it does not change predictions; it silences the default-metric
# warning in older xgboost versions).
xgb_tuned2 = XGBClassifier(
    random_state=1,
    n_estimators=100,
    scale_pos_weight=15,
    gamma=0,
    subsample=0.9,
    learning_rate=0.01,
    max_depth=2,
    reg_lambda=20,
    eval_metric="logloss",
)
# Fit the model on training data
xgb_tuned2.fit(X_train_un, y_train_un)
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
importance_type=None, interaction_constraints='',
learning_rate=0.01, max_bin=256, max_cat_to_onehot=4,
max_delta_step=0, max_depth=2, max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()', n_estimators=100,
n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=1,
reg_alpha=0, reg_lambda=20, ...)
# Tuned XGBoost metrics on the (undersampled) training set
xgboost_random_train = model_performance_classification_sklearn(
    model=xgb_tuned2, predictors=X_train_un, target=y_train_un
)
print("Training performance:")
xgboost_random_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.634734 | 1.0 | 0.577857 | 0.732458 |
# Tuned XGBoost metrics on the validation set
xgboost_random_val = model_performance_classification_sklearn(
    model=xgb_tuned2, predictors=X_val, target=y_val
)
print("Validation performance:")
xgboost_random_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.389931 | 1.0 | 0.208707 | 0.345339 |
# Confusion matrix of the tuned XGBoost on the validation set
confusion_matrix_sklearn(model=xgb_tuned2, predictors=X_val, target=y_val)
%%time
# Hyper-parameter tuning for Gradient Boosting on the undersampled training data.
# defining model
model = GradientBoostingClassifier(random_state=1)
# Parameter grid to sample from in RandomizedSearchCV
# NOTE(review): min_samples_split / min_samples_leaf are given as fractions of
# the training set up to 1.0; large leaf fractions force near-constant trees,
# which matches the degenerate result below (recall 1.0, precision 0.5).
param_grid={'n_estimators':np.arange(50,150,50),
"learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.2],
'subsample':[0.5, 0.6, 0.8, 0.9, 1.0],
'max_depth':np.arange(1,5,1),
"min_samples_split": np.linspace(0.1, 1, 12),
"min_samples_leaf": np.linspace(0.1, 1, 12)
}
# Type of scoring used to compare parameter combinations (recall)
scorer = metrics.make_scorer(metrics.recall_score)
# Calling RandomizedSearchCV: 50 sampled combinations, 5-fold CV, all CPU cores
gbm_tuned2 = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=50, scoring=scorer, cv=5, random_state=1, n_jobs = -1)
# Fitting parameters in RandomizedSearchCV on the undersampled train set
gbm_tuned2.fit(X_train_un,y_train_un)
print("Best parameters are {} with CV score={}:" .format(gbm_tuned2.best_params_,gbm_tuned2.best_score_))
Best parameters are {'subsample': 0.6, 'n_estimators': 50, 'min_samples_split': 0.6727272727272727, 'min_samples_leaf': 0.42727272727272725, 'max_depth': 2, 'learning_rate': 0.025} with CV score=1.0:
CPU times: user 224 ms, sys: 19.5 ms, total: 243 ms
Wall time: 2.25 s
# Rebuild GBM with the best hyper-parameters found by the randomized search
gbm_tuned2 = GradientBoostingClassifier(
    learning_rate=0.025,
    max_depth=2,
    min_samples_leaf=0.42727272727272725,
    min_samples_split=0.6727272727272727,
    n_estimators=50,
    subsample=0.6,
    random_state=1,
)
# Train the tuned model on the undersampled training data
gbm_tuned2.fit(X_train_un, y_train_un)
GradientBoostingClassifier(learning_rate=0.025, max_depth=2,
min_samples_leaf=0.42727272727272725,
min_samples_split=0.6727272727272727,
n_estimators=50, random_state=1, subsample=0.6)
# Tuned GBM metrics on the (undersampled) training set
# (name "gbmoost_random_train" is kept as-is: it is referenced later)
gbmoost_random_train = model_performance_classification_sklearn(
    model=gbm_tuned2, predictors=X_train_un, target=y_train_un
)
print("Training performance:")
gbmoost_random_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.5 | 1.0 | 0.5 | 0.666667 |
# Tuned GBM metrics on the validation set
gbm_random_val = model_performance_classification_sklearn(
    model=gbm_tuned2, predictors=X_val, target=y_val
)
print("Validation performance:")
gbm_random_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.160908 | 1.0 | 0.160908 | 0.277211 |
# Confusion matrix of the tuned GBM on the validation set
confusion_matrix_sklearn(model=gbm_tuned2, predictors=X_val, target=y_val)
# Side-by-side training metrics of the three tuned models
tuned_train_results = [
    Adaboost_random_train,
    xgboost_random_train,
    gbmoost_random_train,
]
tuned_model_labels = [
    "AdaBoost Tuned with Random search",
    "Xgboost Tuned with Random Search",
    "GBM Tuned with Random Search",
]
models_train_comp_df = pd.concat([res.T for res in tuned_train_results], axis=1)
models_train_comp_df.columns = tuned_model_labels
print("Training performance comparison:")
models_train_comp_df.T
Training performance comparison:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| AdaBoost Tuned with Random search | 1.000000 | 1.0 | 1.000000 | 1.000000 |
| Xgboost Tuned with Random Search | 0.634734 | 1.0 | 0.577857 | 0.732458 |
| GBM Tuned with Random Search | 0.500000 | 1.0 | 0.500000 | 0.666667 |
# Side-by-side validation metrics of the three tuned models
tuned_val_results = [
    Adaboost_random_val,
    xgboost_random_val,
    gbm_random_val,
]
tuned_val_labels = [
    "AdaBoost Tuned with Random search",
    "Xgboost Tuned with Random Search",
    "GBM Tuned with Random Search",
]
models_val_comp_df = pd.concat([res.T for res in tuned_val_results], axis=1)
models_val_comp_df.columns = tuned_val_labels
print("Validation performance comparison:")
models_val_comp_df.T
Validation performance comparison:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| AdaBoost Tuned with Random search | 0.944719 | 0.960123 | 0.759709 | 0.848238 |
| Xgboost Tuned with Random Search | 0.389931 | 1.000000 | 0.208707 | 0.345339 |
| GBM Tuned with Random Search | 0.160908 | 1.000000 | 0.160908 | 0.277211 |
# Final evaluation of the tuned XGBoost on the held-out test set
xgboost_grid_test = model_performance_classification_sklearn(
    model=xgb_tuned2, predictors=X_test, target=y_test
)
print("Test performance:")
xgboost_grid_test
Test performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.367226 | 1.0 | 0.20224 | 0.336439 |
# Horizontal bar chart of XGBoost feature importances, least important at the bottom
feature_names = X_train.columns
importances = xgb_tuned2.feature_importances_
indices = np.argsort(importances)  # ascending importance order
ypos = range(len(indices))
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(ypos, importances[indices], color="violet", align="center")
plt.yticks(ypos, [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
For categorical columns, we will do one-hot encoding and missing-value imputation as pre-processing.
We are doing missing-value imputation for the whole dataset, so that any missing value appearing in future data can also be handled.
We will use the df dataframe, since the data1 dataframe has already undergone the transforms and imputation.
# Preview the first five rows of the raw dataframe (before imputation/encoding)
df.head()
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 45 | M | 3 | 2.0 | Married | 3.0 | 1 | 39 | 5 | 1 | 3 | 9.448648 | 777 | 9.385469 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 0 | 49 | F | 5 | 4.0 | Single | 1.0 | 1 | 44 | 6 | 1 | 2 | 9.018695 | 864 | 8.908154 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 0 | 51 | M | 3 | 4.0 | Married | 4.0 | 1 | 36 | 4 | 1 | 0 | 8.136811 | 0 | 8.136811 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 0 | 40 | F | 4 | 2.0 | NaN | 1.0 | 1 | 34 | 3 | 4 | 1 | 8.105609 | 2517 | 6.679599 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 0 | 40 | M | 3 | 1.0 | Married | 3.0 | 1 | 21 | 5 | 1 | 0 | 8.458716 | 0 | 8.458716 | 2.175 | 816 | 28 | 2.500 | 0.000 |
# Count missing values per column to see what the imputers must handle
df.isna().sum()
Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 1112 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
# Column dtypes and non-null counts; drives the numeric/categorical split below
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null int64 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null object 3 Dependent_count 10127 non-null int64 4 Education_Level 8608 non-null float64 5 Marital_Status 9378 non-null object 6 Income_Category 9015 non-null float64 7 Card_Category 10127 non-null int64 8 Months_on_book 10127 non-null int64 9 Total_Relationship_Count 10127 non-null int64 10 Months_Inactive_12_mon 10127 non-null int64 11 Contacts_Count_12_mon 10127 non-null int64 12 Credit_Limit 10127 non-null float64 13 Total_Revolving_Bal 10127 non-null int64 14 Avg_Open_To_Buy 10127 non-null float64 15 Total_Amt_Chng_Q4_Q1 10127 non-null float64 16 Total_Trans_Amt 10127 non-null int64 17 Total_Trans_Ct 10127 non-null int64 18 Total_Ct_Chng_Q4_Q1 10127 non-null float64 19 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(7), int64(11), object(2) memory usage: 1.5+ MB
# Numerical columns are every int64/float64 column except the target
numerical_features = df.select_dtypes(include=["int64", "float64"]).columns.drop(
    "Attrition_Flag"
)
# Numeric pipeline: median imputation only
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
# Categorical columns are the object-dtype columns
categorical_features = df.select_dtypes(include=["object"]).columns
# Categorical pipeline: mode imputation, then one-hot encoding.
# handle_unknown="ignore" lets the encoder cope with categories that were
# never seen during fit (e.g. in future/test data).
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)
# Combine both pipelines into one ColumnTransformer.
# remainder="passthrough" keeps any column not listed in either feature set
# instead of dropping it (the ColumnTransformer default).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="passthrough",
)
# Inspect the numerical pipeline's parameters.
# NOTE: the original passed the string 'self' positionally, which landed in the
# `deep: bool` parameter and was merely treated as truthy — call with no args.
numeric_transformer.get_params()
{'memory': None,
'steps': [('imputer', SimpleImputer(strategy='median'))],
'verbose': False,
'imputer': SimpleImputer(strategy='median'),
'imputer__add_indicator': False,
'imputer__copy': True,
'imputer__fill_value': None,
'imputer__missing_values': nan,
'imputer__strategy': 'median',
'imputer__verbose': 0}
# Inspect the categorical pipeline's parameters.
# NOTE: the original passed the string 'self' positionally, which landed in the
# `deep: bool` parameter and was merely treated as truthy — call with no args.
categorical_transformer.get_params()
{'memory': None,
'steps': [('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))],
'verbose': False,
'imputer': SimpleImputer(strategy='most_frequent'),
'onehot': OneHotEncoder(handle_unknown='ignore'),
'imputer__add_indicator': False,
'imputer__copy': True,
'imputer__fill_value': None,
'imputer__missing_values': nan,
'imputer__strategy': 'most_frequent',
'imputer__verbose': 0,
'onehot__categories': 'auto',
'onehot__drop': None,
'onehot__dtype': numpy.float64,
'onehot__handle_unknown': 'ignore',
'onehot__sparse': True}
# Inspect the full ColumnTransformer configuration, including nested
# sub-estimator parameters (deep=True is the sklearn default, made explicit).
preprocessor.get_params(deep=True)
{'n_jobs': None,
'remainder': 'passthrough',
'sparse_threshold': 0.3,
'transformer_weights': None,
'transformers': [('num',
Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))]),
Index(['Customer_Age', 'Dependent_count', 'Education_Level', 'Income_Category', 'Card_Category', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio'], dtype='object')),
('cat',
Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))]),
Index(['Gender', 'Marital_Status'], dtype='object'))],
'verbose': False,
'verbose_feature_names_out': True,
'num': Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))]),
'cat': Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))]),
'num__memory': None,
'num__steps': [('imputer', SimpleImputer(strategy='median'))],
'num__verbose': False,
'num__imputer': SimpleImputer(strategy='median'),
'num__imputer__add_indicator': False,
'num__imputer__copy': True,
'num__imputer__fill_value': None,
'num__imputer__missing_values': nan,
'num__imputer__strategy': 'median',
'num__imputer__verbose': 0,
'cat__memory': None,
'cat__steps': [('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))],
'cat__verbose': False,
'cat__imputer': SimpleImputer(strategy='most_frequent'),
'cat__onehot': OneHotEncoder(handle_unknown='ignore'),
'cat__imputer__add_indicator': False,
'cat__imputer__copy': True,
'cat__imputer__fill_value': None,
'cat__imputer__missing_values': nan,
'cat__imputer__strategy': 'most_frequent',
'cat__imputer__verbose': 0,
'cat__onehot__categories': 'auto',
'cat__onehot__drop': None,
'cat__onehot__dtype': numpy.float64,
'cat__onehot__handle_unknown': 'ignore',
'cat__onehot__sparse': True}
# Target vs. predictors
Y = df["Attrition_Flag"]
X = df.drop(columns="Attrition_Flag")

# Stratified 70/30 split preserves the attrition class ratio in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=1,
    stratify=Y,
)
print(X_train.shape, X_test.shape)
(7088, 19) (3039, 19)
# Final pipeline: preprocessing step followed by the tuned XGBoost classifier
# (hyperparameters chosen earlier; held in a dict for readability).
xgb_params = dict(
    random_state=1,
    n_estimators=100,
    scale_pos_weight=15,
    gamma=0,
    subsample=0.9,
    learning_rate=0.01,
    max_depth=2,
    reg_lambda=20,
)
mypipeline = Pipeline(
    steps=[
        ("pre", preprocessor),
        ("XGB", XGBClassifier(**xgb_params)),
    ]
)

# Train the full pipeline on the training partition
mypipeline.fit(X_train, y_train)
Pipeline(steps=[('pre',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median'))]),
Index(['Customer_Age', 'Dependent_count', 'Education_Level', 'Income_Category', 'Card_Category', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit', '...
gamma=0, gpu_id=-1, grow_policy='depthwise',
importance_type=None, interaction_constraints='',
learning_rate=0.01, max_bin=256,
max_cat_to_onehot=4, max_delta_step=0,
max_depth=2, max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()',
n_estimators=100, n_jobs=0, num_parallel_tree=1,
predictor='auto', random_state=1, reg_alpha=0,
reg_lambda=20, ...))])
# Accuracy of the fitted pipeline on the training split
mypipeline.score(X_train, y_train)
0.6094808126410836
# Accuracy of the fitted pipeline on the held-out test split
mypipeline.score(X_test, y_test)
0.5913129318854886